elibrum 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile ADDED
@@ -0,0 +1,4 @@
# Development dependencies for elibrum, resolved by Bundler.
# Gems are fetched over HTTPS so that downloads cannot be tampered with in transit.
source "https://rubygems.org"

gem "nokogiri", "~> 1.5.6"
gem "erubis", "~> 2.7.0"
data/README.md ADDED
@@ -0,0 +1,64 @@
1
+ Elibrum
2
+ =======
3
+
4
+ Elibrum converts webpages into ebooks. It extracts the text with Boilerpipe and builds the ebook with `ebook-convert`.
5
+
6
+ JRuby running in Ruby 1.9 mode (Ruby 1.9.2 or greater) is required.
7
+
8
+ ## Installation
9
+
10
+ Install [Calibre](http://calibre-ebook.com/) to get `ebook-convert`.
11
+
12
+ ```
13
+ $ rvm install jruby
14
+ $ rvm use jruby
15
+ $ export JRUBY_OPTS=--1.9
16
+ $ gem install elibrum
17
+ ```
18
+
19
+ ## Usage
20
+
21
+ ```
22
+ require "elibrum"
23
+
24
+ # Must be set. Usually in the following location on OS X:
25
+ Elibrum::EBOOK_CONVERT_PATH = "/Applications/calibre.app/Contents/MacOS/ebook-convert"
26
+
27
+ Elibrum::Builder.new("git_tutorials", :epub) do
28
+ add "http://gitready.com/intermediate/2009/01/31/intro-to-rebase.html"
29
+ add "http://gitready.com/intermediate/2009/02/13/list-remote-branches.html"
30
+ add "http://gitready.com/advanced/2009/07/31/tig-the-ncurses-front-end-to-git.html"
31
+ end
32
+
33
+ links = []
34
+ links << "http://techcrunch.com/2013/01/25/eu-enlists-telefonica-cisco-hp-nokia-arm-and-others-to-close-the-700k-it-job-gap-in-europe/"
35
+ links << "http://techcrunch.com/2013/01/25/h265-is-approved/"
36
+ links << "http://techcrunch.com/2013/01/24/nokia-confirms-the-pure-view-was-officially-the-last-symbian-phone/"
37
+
38
+ Elibrum::Builder.new("blog_posts", [:epub, :mobi, :pdf]) do |b|
39
+ b.title = "Some Blog Posts"
40
+ b.author = "TechCrunch"
41
+
42
+ b.add *links do |a|
43
+ a.extractor = :largest_content
44
+ a.modify do |title, text|
45
+ title = title.split(" | ").first.strip
46
+ text = text.split("Comments").last
47
+ [title, text]
48
+ end
49
+ end
50
+ end
51
+
52
+ # Frontpage of Hacker News as an ebook!
53
+ require "ruby-hackernews"
54
+ Elibrum::Builder.new("frontpage", [:epub, :pdf]) do
55
+ links = RubyHackernews::Entry.all.map{|e| e.link.href}.reject{|l| l[0...4] != "http"}
56
+ add *links
57
+ end
58
+ ```
59
+
60
+ ## TODO
61
+
62
+ 1. Add tests
63
+ 1. Modify Boilerpipe to send a user agent and accept a string as input (see Webpage#text)
64
+ 1. Use [gae-boilerpipe](https://github.com/gregbayer/gae-boilerpipe) to make the project pure Ruby
data/elibrum.gemspec ADDED
@@ -0,0 +1,20 @@
# Gem specification for elibrum (JRuby only, see s.platform).

# Put this checkout's lib/ at the front of the load path so that the version
# file required below is the one being packaged.
lib = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require "elibrum/version"

Gem::Specification.new do |s|
  s.name = "elibrum"
  s.version = Elibrum::VERSION
  s.authors = ["Dave Sescleifer"]
  s.summary = "Converts webpages into ebooks"
  s.description = "Converts webpages into ebooks."
  s.email = "dave@sescleifer.com"
  s.platform = "java"
  s.homepage = "http://github.com/dsesclei/elibrum"
  s.require_paths = ["lib"]

  # Ask git for the tracked files once and filter the list in Ruby. The
  # previous `{test,spec,features}/*` pattern depended on the shell performing
  # brace expansion, which a POSIX sh is not required to do.
  s.files = `git ls-files`.split("\n")
  s.test_files = s.files.grep(%r{\A(test|spec|features)/})
  s.executables = s.files.grep(%r{\Abin/}).map { |f| File.basename(f) }

  s.add_runtime_dependency "nokogiri", ["~> 1.5.6"]
  s.add_runtime_dependency "erubis", ["~> 2.7.0"]
end
@@ -0,0 +1,20 @@
module Elibrum
  # An ordered list of web pages plus the title and author that are passed to
  # ebook-convert.
  class Book
    attr_accessor :title, :author
    attr_reader :pages

    # filename - String base name of the output file. The default title is
    #            derived from it by splitting on "_" and "-" and capitalizing
    #            each word ("git_tutorials" becomes "Git Tutorials").
    # block    - optional configuration block. A block without parameters is
    #            instance_eval'd so that `add` can be called bare; a block
    #            with a parameter is called with the book instead.
    def initialize(filename, &block)
      @title = filename.split(/_|-/).map(&:capitalize).join(" ")
      @author = "Unknown"
      @pages = []

      # A book may be created without a configuration block.
      return unless block

      block.arity < 1 ? instance_eval(&block) : block.call(self)
    end

    # Appends one Webpage per link, in order. The optional block is forwarded
    # to every Webpage so that each page is configured the same way.
    def add(*links, &block)
      links.each do |url|
        @pages << Webpage.new(url, &block)
      end
    end
  end
end
@@ -0,0 +1,40 @@
module Elibrum
  # Collects the pages described by the configuration block and converts them
  # into one ebook per requested format with Calibre's ebook-convert.
  # All of the work happens in the constructor.
  class Builder
    # filename - String base name; the outputs are written to
    #            "<filename>.<format>" in the current directory.
    # formats  - a single format or an Array of formats (:epub, :mobi, ...).
    # template - path of the Erubis template rendered once per page.
    # block    - configuration block, forwarded to Book.new.
    #
    # Raises RuntimeError when Elibrum::EBOOK_CONVERT_PATH is not defined.
    def initialize(filename, formats, template=default_template, &block)
      raise "Elibrum::EBOOK_CONVERT_PATH is not defined. Put the path to ebook-convert in this constant." unless defined?(EBOOK_CONVERT_PATH)

      @template = template
      FileUtils.mkdir_p(".images-temp")

      begin
        @book = Book.new(filename, &block)
        [*formats].each { |format| build(filename, format) }
      ensure
        # Remove the downloaded images even when a page or a conversion fails.
        FileUtils.rm_rf(".images-temp")
      end
    end

    # Converts the book into a single format and returns whatever
    # ebook-convert printed on standard output.
    def build(filename, format)
      # Create an HTML file to use as the source file for conversion
      source = ".#{filename}-temp.html"
      File.open(source, "w") { |f| f.write(html) }

      # Documentation for ebook-convert can be found here: http://manual.calibre-ebook.com/cli/ebook-convert.html
      # The command is passed as an argument list, so no shell is involved:
      # quotes, spaces or "$(...)" in a title or filename reach ebook-convert
      # verbatim instead of being interpreted.
      command = [
        EBOOK_CONVERT_PATH.to_s, source, "#{filename}.#{format}",
        "--title", @book.title.to_s,
        "--authors", @book.author.to_s,
        "--chapter", "//h:pagebreak"
      ]
      IO.popen(command) { |io| io.read }
    ensure
      FileUtils.rm_f(source) if source
    end

    private

    # The rendered HTML of all pages, concatenated in order. Memoized, so
    # every output format is built from the same source.
    def html
      @html ||= begin
        renderer = Erubis::Eruby.new(File.read(@template))
        @book.pages.map { |page|
          renderer.result(title: page.title, text: page.text, url: page.url)
        }.join
      end
    end

    # Path of the template that ships next to this file.
    def default_template
      File.expand_path("default.erb", File.dirname(__FILE__))
    end
  end
end
@@ -0,0 +1,3 @@
1
+ <pagebreak /> <!-- This is a marker for ebook-convert -->
2
+ <h1><%= title %></h1>
3
+ <span><%= text %></span>
@@ -0,0 +1,18 @@
1
+
2
+ boilerpipe
3
+
4
+ Copyright (c) 2009-2011 Christian Kohlschütter
5
+
6
+ The author licenses this file to You under the Apache License, Version 2.0
7
+ (the "License"); you may not use this file except in compliance with
8
+ the License. You may obtain a copy of the License at
9
+
10
+ http://www.apache.org/licenses/LICENSE-2.0
11
+
12
+ Unless required by applicable law or agreed to in writing, software
13
+ distributed under the License is distributed on an "AS IS" BASIS,
14
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ See the License for the specific language governing permissions and
16
+ limitations under the License.
17
+
18
+
@@ -0,0 +1,24 @@
1
+
2
+ boilerpipe
3
+
4
+ Copyright (c) 2009-2011 Christian Kohlschütter
5
+
6
+ The author licenses this file to You under the Apache License, Version 2.0
7
+ (the "License"); you may not use this file except in compliance with
8
+ the License. You may obtain a copy of the License at
9
+
10
+ http://www.apache.org/licenses/LICENSE-2.0
11
+
12
+ Unless required by applicable law or agreed to in writing, software
13
+ distributed under the License is distributed on an "AS IS" BASIS,
14
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ See the License for the specific language governing permissions and
16
+ limitations under the License.
17
+
18
+
19
+ This software contains the following parts which are also provided
20
+ under the Apache License 2.0 (http://apache.org/licenses/LICENSE-2.0.txt):
21
+
22
+ - NekoHTML
23
+ - Xerces
24
+
Binary file
@@ -0,0 +1,3 @@
module Elibrum
  # Gem release version; elibrum.gemspec assigns it to s.version.
  VERSION = "0.1.0"
end
@@ -0,0 +1,94 @@
module Elibrum
  # A single web page that becomes one chapter of the ebook.
  #
  # Construction is eager: the title is read from the page's <title> element,
  # the text is extracted with Boilerpipe, the optional #modify hook is
  # applied, and every <img> in the result is downloaded into .images-temp/.
  class Webpage
    attr_reader :url, :extractor

    # url   - String URL of the page.
    # block - optional configuration block (see #extractor= and #modify).
    #         A block without parameters is instance_eval'd; a block with a
    #         parameter is called with the page.
    def initialize(url, &block)
      # Delete the trailing / so that URI.join in localize_images doesn't confuse this URL for a directory
      @url = url.chomp("/")
      @modify = proc { |title, text| [title, text] }
      @extractor = CommonExtractors::ARTICLE_EXTRACTOR

      if block
        block.arity < 1 ? instance_eval(&block) : block.call(self)
      end

      @title, @text = @modify.call(title, text)
      @text = localize_images(@text)
    end

    # Text of the page's <title> element (memoized).
    def title
      @title ||= Nokogiri::HTML(content).xpath("//title").text
    end

    # HTML of the page's main content as extracted by Boilerpipe (memoized).
    # When extraction fails, the failure message is printed and returned in
    # place of the text so that the rest of the book can still be built.
    def text
      # The page is loaded both here and in Webpage#content.
      # TODO: Modify process() to accept a string as input.
      @text ||= begin
        highlighter = HTMLHighlighter.newExtractingInstance(true, false)
        highlighter.process(URL.new(@url), @extractor)
      rescue SystemExit, SignalException
        # Never swallow Ctrl-C or an explicit exit.
        raise
      rescue Exception => e
        # NOTE(review): kept broad on purpose. Boilerpipe is Java code and its
        # failures presumably arrive as Java exceptions that a narrower rescue
        # could miss; confirm on the target JRuby before tightening.
        # Boilerpipe does not pass along a user agent when retrieving sites, so some return a 403
        msg = "Failed to load #{@url} (#{e})"
        puts msg
        msg
      end
    end

    # Selects the Boilerpipe extractor by name: :article, :canola, :default,
    # :keep_everything or :largest_content.
    #
    # Raises RuntimeError for any other value.
    def extractor=(extractor_type)
      @extractor = case extractor_type
                   when :article then CommonExtractors::ARTICLE_EXTRACTOR
                   when :canola then CommonExtractors::CANOLA_EXTRACTOR
                   when :default then CommonExtractors::DEFAULT_EXTRACTOR
                   when :keep_everything then CommonExtractors::KEEP_EVERYTHING_EXTRACTOR
                   when :largest_content then CommonExtractors::LARGEST_CONTENT_EXTRACTOR
                   else raise "Invalid extractor: #{extractor_type}"
                   end
    end

    # Allows post-extraction processing of the text.
    # Useful for removing bits that the extractor accidentally included.
    # The block receives (title, text) and must return [title, text].
    def modify(&block)
      @modify = block
    end

    private

    # Raw HTML of the page (memoized).
    def content
      @content ||= fetch(@url)
    end

    # Downloads +location+ and returns the response body.
    # Uses URI#read from open-uri instead of Kernel#open, so a string such as
    # "| cmd" can never be executed as a command, and fetching keeps working
    # on Rubies where Kernel#open no longer accepts URLs.
    def fetch(location)
      URI.parse(location.to_s).read
    end

    # Download the images to a local folder so that they will appear in the ebook.
    # An image that cannot be downloaded is reported and keeps its original
    # src; no file is created for it.
    def localize_images(html)
      noko = Nokogiri::HTML(html)
      noko.xpath("//img/@src").each do |src|
        begin
          # Convert all relative paths to absolute ones so that we can download the images.
          absolute_url = URI.join(@url, src.value).to_s
          image = fetch(absolute_url)

          # Only touch the disk and the document once the download succeeded.
          filename = unused_image_path(image_extension(src.value))
          File.open(filename, "wb") { |f| f.write(image) }
          src.value = filename
        rescue SystemExit, SignalException
          raise
        rescue Exception => e
          # Broad for the same reason as in #text; one bad image must not
          # abort the whole book.
          puts "Failed to load #{absolute_url || src.value} (#{e})"
        end
      end

      noko.to_html
    end

    # File extension (including the dot) of an image reference, ignoring any
    # query string or fragment. Returns "" when there is no plain
    # alphanumeric extension.
    def image_extension(src)
      path = src.split(/[?#]/).first.to_s
      extension = File.extname(path)
      extension =~ /\A\.[A-Za-z0-9]+\z/ ? extension : ""
    end

    # A random path inside .images-temp/ that is not in use yet.
    def unused_image_path(extension)
      loop do
        candidate = ".images-temp/" + SecureRandom.hex + extension
        return candidate unless File.exist?(candidate)
      end
    end
  end
end
data/lib/elibrum.rb ADDED
@@ -0,0 +1,21 @@
# Entry point of the gem: loads the Ruby and Java dependencies, imports the
# Boilerpipe classes, then loads the Elibrum classes.
raise "Elibrum only works on JRuby at the moment." unless RUBY_PLATFORM =~ /java/

require "open-uri"
require "uri"
# Standard-library features are named in lower case; the capitalized
# spellings only resolve on case-insensitive filesystems.
require "fileutils"
require "securerandom"
require "erubis"
require "nokogiri"
require "java"
require "elibrum/jars/boilerpipe-1.2.0.jar"
require "elibrum/jars/nekohtml-1.9.13.jar"
require "elibrum/jars/xerces-2.9.1.jar"

java_import "de.l3s.boilerpipe.extractors.CommonExtractors"
java_import "de.l3s.boilerpipe.sax.HTMLHighlighter"
java_import "de.l3s.boilerpipe.sax.HTMLDocument"
java_import java.net.URL

require "elibrum/builder"
require "elibrum/book"
require "elibrum/webpage"
metadata ADDED
@@ -0,0 +1,91 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: elibrum
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.1.0
6
+ platform: java
7
+ authors:
8
+ - Dave Sescleifer
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-01-28 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ version_requirements: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ~>
19
+ - !ruby/object:Gem::Version
20
+ version: 1.5.6
21
+ none: false
22
+ requirement: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: 1.5.6
27
+ none: false
28
+ prerelease: false
29
+ type: :runtime
30
+ - !ruby/object:Gem::Dependency
31
+ name: erubis
32
+ version_requirements: !ruby/object:Gem::Requirement
33
+ requirements:
34
+ - - ~>
35
+ - !ruby/object:Gem::Version
36
+ version: 2.7.0
37
+ none: false
38
+ requirement: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - ~>
41
+ - !ruby/object:Gem::Version
42
+ version: 2.7.0
43
+ none: false
44
+ prerelease: false
45
+ type: :runtime
46
+ description: Converts webpages into ebooks.
47
+ email: dave@sescleifer.com
48
+ executables: []
49
+ extensions: []
50
+ extra_rdoc_files: []
51
+ files:
52
+ - Gemfile
53
+ - README.md
54
+ - elibrum.gemspec
55
+ - lib/elibrum.rb
56
+ - lib/elibrum/book.rb
57
+ - lib/elibrum/builder.rb
58
+ - lib/elibrum/default.erb
59
+ - lib/elibrum/jars/LICENSE.txt
60
+ - lib/elibrum/jars/NOTICE.txt
61
+ - lib/elibrum/jars/boilerpipe-1.2.0.jar
62
+ - lib/elibrum/jars/nekohtml-1.9.13.jar
63
+ - lib/elibrum/jars/xerces-2.9.1.jar
64
+ - lib/elibrum/version.rb
65
+ - lib/elibrum/webpage.rb
66
+ homepage: http://github.com/dsesclei/elibrum
67
+ licenses: []
68
+ post_install_message:
69
+ rdoc_options: []
70
+ require_paths:
71
+ - lib
72
+ required_ruby_version: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - ! '>='
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ none: false
78
+ required_rubygems_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ! '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ none: false
84
+ requirements: []
85
+ rubyforge_project:
86
+ rubygems_version: 1.8.24
87
+ signing_key:
88
+ specification_version: 3
89
+ summary: Converts webpages into ebooks
90
+ test_files: []
91
+ ...