elibrum 0.1.0-java

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Development dependencies for elibrum.
# Gems are fetched over HTTPS so that downloads cannot be tampered with in transit.
source "https://rubygems.org"

gem "nokogiri", "~> 1.5.6"
gem "erubis", "~> 2.7.0"
data/README.md ADDED
@@ -0,0 +1,64 @@
1
+ Elibrum
2
+ =======
3
+
4
+ Elibrum converts webpages into ebooks. It extracts the text with Boilerpipe and builds the ebook with `ebook-convert`.
5
+
6
+ JRuby running in Ruby 1.9 mode (compatible with Ruby 1.9.2 or greater) is required.
7
+
8
+ ## Installation
9
+
10
+ Install [Calibre](http://calibre-ebook.com/) to get `ebook-convert`.
11
+
12
+ ```
13
+ $ rvm install jruby
14
+ $ rvm use jruby
15
+ $ export JRUBY_OPTS=--1.9
16
+ $ gem install elibrum
17
+ ```
18
+
19
+ ## Usage
20
+
21
+ ```
22
+ require "elibrum"
23
+
24
+ # Must be set. Usually in the following location on OS X:
25
+ Elibrum::EBOOK_CONVERT_PATH = "/Applications/calibre.app/Contents/MacOS/ebook-convert"
26
+
27
+ Elibrum::Builder.new("git_tutorials", :epub) do
28
+ add "http://gitready.com/intermediate/2009/01/31/intro-to-rebase.html"
29
+ add "http://gitready.com/intermediate/2009/02/13/list-remote-branches.html"
30
+ add "http://gitready.com/advanced/2009/07/31/tig-the-ncurses-front-end-to-git.html"
31
+ end
32
+
33
+ links = []
34
+ links << "http://techcrunch.com/2013/01/25/eu-enlists-telefonica-cisco-hp-nokia-arm-and-others-to-close-the-700k-it-job-gap-in-europe/"
35
+ links << "http://techcrunch.com/2013/01/25/h265-is-approved/"
36
+ links << "http://techcrunch.com/2013/01/24/nokia-confirms-the-pure-view-was-officially-the-last-symbian-phone/"
37
+
38
+ Elibrum::Builder.new("blog_posts", [:epub, :mobi, :pdf]) do |b|
39
+ b.title = "Some Blog Posts"
40
+ b.author = "TechCrunch"
41
+
42
+ b.add *links do |a|
43
+ a.extractor = :largest_content
44
+ a.modify do |title, text|
45
+ title = title.split(" | ").first.strip
46
+ text = text.split("Comments").last
47
+ [title, text]
48
+ end
49
+ end
50
+ end
51
+
52
+ # Frontpage of Hacker News as an ebook!
53
+ require "ruby-hackernews"
54
+ Elibrum::Builder.new("frontpage", [:epub, :pdf]) do
55
+ links = RubyHackernews::Entry.all.map{|e| e.link.href}.reject{|l| l[0...4] != "http"}
56
+ add *links
57
+ end
58
+ ```
59
+
60
+ ## TODO
61
+
62
+ 1. Add tests
63
+ 1. Modify Boilerpipe to send user agent and accept a string as input (see Webpage#text)
64
+ 1. Use [gae-boilerpipe](https://github.com/gregbayer/gae-boilerpipe) to make the project pure Ruby
data/elibrum.gemspec ADDED
@@ -0,0 +1,20 @@
1
# Put lib/ on the load path so the version constant can be read straight
# from the checkout.
$LOAD_PATH.push File.expand_path("../lib", __FILE__)
require "elibrum/version"

Gem::Specification.new do |spec|
  # Identity
  spec.name        = "elibrum"
  spec.version     = Elibrum::VERSION
  spec.platform    = "java"
  spec.summary     = "Converts webpages into ebooks"
  spec.description = "Converts webpages into ebooks."

  # Author and project links
  spec.authors  = ["Dave Sescleifer"]
  spec.email    = "dave@sescleifer.com"
  spec.homepage = "http://github.com/dsesclei/elibrum"

  # Packaged files are whatever git tracks.
  spec.require_paths = ["lib"]
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }

  # Runtime dependencies
  spec.add_runtime_dependency "nokogiri", ["~> 1.5.6"]
  spec.add_runtime_dependency "erubis", ["~> 2.7.0"]
end
@@ -0,0 +1,20 @@
1
module Elibrum
  # An ordered collection of webpages plus the title/author metadata of the
  # ebook they will be compiled into.
  class Book
    attr_accessor :title, :author
    attr_reader :pages

    # filename - base name of the output file. The default title is derived
    #            from it by splitting on "_" or "-" and capitalizing each part
    #            (e.g. "git_tutorials" becomes "Git Tutorials").
    # block    - optional configuration block. A block that takes no
    #            parameters is evaluated in the context of the book (so `add`
    #            can be called directly); otherwise the book is passed in.
    def initialize(filename, &block)
      @title = filename.split(/_|-/).map(&:capitalize).join(" ")
      @author = "Unknown"
      @pages = []

      # A book may be created without a block; calling #arity on nil would raise.
      return unless block

      block.arity < 1 ? instance_eval(&block) : block.call(self)
    end

    # Appends one Webpage per link. The optional block is forwarded to every
    # Webpage so each page can be configured the same way.
    def add(*links, &block)
      links.each do |url|
        @pages << Webpage.new(url, &block)
      end
    end
  end
end
@@ -0,0 +1,40 @@
1
module Elibrum
  # Collects webpages into a Book and converts them into one ebook per
  # requested format using Calibre's `ebook-convert`.
  class Builder
    # filename - base name (without extension) of the generated ebook files.
    # formats  - a single format or a list of formats (e.g. :epub or [:epub, :pdf]).
    # template - path to the ERB template rendered once per page.
    # block    - configuration block forwarded to Book.new.
    #
    # Raises RuntimeError when EBOOK_CONVERT_PATH has not been defined.
    def initialize(filename, formats, template=default_template, &block)
      raise "Elibrum::EBOOK_CONVERT_PATH is not defined. Put the path to ebook-convert in this constant." unless defined?(EBOOK_CONVERT_PATH)

      FileUtils.mkdir_p(".images-temp")
      begin
        @book = Book.new(filename, &block)
        @template = template

        [*formats].each do |format|
          build(filename, format)
        end
      ensure
        # Remove the downloaded images even when fetching or converting fails.
        FileUtils.rm_rf(".images-temp")
      end
    end

    # Renders the book to a temporary HTML file and converts it to
    # "<filename>.<format>". Returns whatever ebook-convert printed on stdout.
    def build(filename, format)
      # Create an HTML file to use as the source file for conversion
      source = ".#{filename}-temp.html"
      File.open(source, "w") { |f| f.write(html) }

      # Documentation for ebook-convert can be found here: http://manual.calibre-ebook.com/cli/ebook-convert.html
      # The command is passed as an argument list so no shell is involved:
      # quotes or spaces in the path, title or author cannot break the command.
      command = [
        EBOOK_CONVERT_PATH, source, "#{filename}.#{format}",
        "--title", @book.title.to_s,
        "--authors", @book.author.to_s,
        "--chapter", "//h:pagebreak"
      ]
      IO.popen(command) { |io| io.read }
    ensure
      FileUtils.rm_f(source) if source
    end

    private

    # Concatenated HTML of every page rendered through the template.
    # Memoized so the pages are rendered only once across all formats.
    def html
      @html ||= begin
        renderer = Erubis::Eruby.new(File.read(@template))
        rendered = @book.pages.map do |page|
          renderer.result(title: page.title, text: page.text, url: page.url)
        end
        rendered.join
      end
    end

    # Path of the template shipped next to this file.
    def default_template
      File.expand_path("default.erb", File.dirname(__FILE__))
    end
  end
end
@@ -0,0 +1,3 @@
1
<%# Rendered once per page with the locals title, text and url. %>
<pagebreak /> <!-- This is a marker for ebook-convert -->
<%# The title is plain text, so it is HTML-escaped; text is already HTML and is emitted as-is. %>
<h1><%== title %></h1>
<span><%= text %></span>
@@ -0,0 +1,18 @@
1
+
2
+ boilerpipe
3
+
4
+ Copyright (c) 2009-2011 Christian Kohlschütter
5
+
6
+ The author licenses this file to You under the Apache License, Version 2.0
7
+ (the "License"); you may not use this file except in compliance with
8
+ the License. You may obtain a copy of the License at
9
+
10
+ http://www.apache.org/licenses/LICENSE-2.0
11
+
12
+ Unless required by applicable law or agreed to in writing, software
13
+ distributed under the License is distributed on an "AS IS" BASIS,
14
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ See the License for the specific language governing permissions and
16
+ limitations under the License.
17
+
18
+
@@ -0,0 +1,24 @@
1
+
2
+ boilerpipe
3
+
4
+ Copyright (c) 2009-2011 Christian Kohlschütter
5
+
6
+ The author licenses this file to You under the Apache License, Version 2.0
7
+ (the "License"); you may not use this file except in compliance with
8
+ the License. You may obtain a copy of the License at
9
+
10
+ http://www.apache.org/licenses/LICENSE-2.0
11
+
12
+ Unless required by applicable law or agreed to in writing, software
13
+ distributed under the License is distributed on an "AS IS" BASIS,
14
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ See the License for the specific language governing permissions and
16
+ limitations under the License.
17
+
18
+
19
+ This software contains the following parts which are also provided
20
+ under the Apache License 2.0 (http://apache.org/licenses/LICENSE-2.0.txt):
21
+
22
+ - NekoHTML
23
+ - Xerces
24
+
Binary file
@@ -0,0 +1,3 @@
1
module Elibrum
  # Version of the gem; read by elibrum.gemspec. Frozen so it cannot be
  # mutated at runtime.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,94 @@
1
module Elibrum
  # A single webpage of the book: its title, its extracted main text
  # (via Boilerpipe) and locally downloaded copies of its images.
  class Webpage
    attr_reader :url, :extractor

    # url   - address of the page.
    # block - optional configuration block. A block that takes no parameters
    #         is evaluated in the context of the page; otherwise the page is
    #         passed in. It may set the extractor and register a modify hook.
    #
    # The page is fetched and processed immediately.
    def initialize(url, &block)
      # Delete the trailing / so that URI.join in localize_images doesn't confuse this URL for a directory
      @url = url.end_with?("/") ? url[0...-1] : url
      @modify = proc {|title, text| [title, text]}
      @extractor = CommonExtractors::ARTICLE_EXTRACTOR

      if block
        block.arity < 1 ? instance_eval(&block) : block.call(self)
      end

      @title, @text = @modify.call(title, text)
      @text = localize_images(@text)
    end

    # Text of the page's <title> element (memoized).
    def title
      @title ||= Nokogiri::HTML(content).xpath("//title").text
    end

    # HTML of the page's main content as selected by the extractor (memoized).
    # When extraction fails, the failure message is printed and used as the text.
    def text
      # The page is loaded both here and in Webpage#content.
      # TODO: Modify process() to accept a string as input.
      @text ||= begin
        highlighter = HTMLHighlighter.newExtractingInstance(true, false)
        highlighter.process(URL.new(@url), @extractor)
      # NOTE(review): the broad rescue is kept because the call above runs Java
      # code; Java exceptions may not be StandardError subclasses on every
      # JRuby version -- confirm before narrowing.
      rescue Exception => e
        # Boilerpipe does not pass along a user agent when retrieving sites, so some return a 403
        msg = "Failed to load #{@url} (#{e})"
        puts msg
        msg
      end
    end

    # Selects the Boilerpipe extractor by name.
    # Accepts :article, :canola, :default, :keep_everything or :largest_content.
    # Raises RuntimeError for any other value.
    def extractor=(extractor_type)
      @extractor = case extractor_type
        when :article
          CommonExtractors::ARTICLE_EXTRACTOR
        when :canola
          CommonExtractors::CANOLA_EXTRACTOR
        when :default
          CommonExtractors::DEFAULT_EXTRACTOR
        when :keep_everything
          CommonExtractors::KEEP_EVERYTHING_EXTRACTOR
        when :largest_content
          CommonExtractors::LARGEST_CONTENT_EXTRACTOR
        else
          raise "Invalid extractor: #{extractor_type}"
      end
    end

    # Allows post-extraction processing of the text.
    # Useful for removing bits that the extractor accidentally included.
    # The block receives (title, text) and must return [title, text].
    def modify(&block)
      @modify = block
    end

    private

    # Raw body of the page (memoized).
    def content
      # URI#open (provided by open-uri) is used instead of Kernel#open:
      # Kernel#open would run a string starting with "|" as a shell command
      # and no longer fetches URLs on Ruby 3.
      @content ||= URI.parse(@url).open.read
    end

    # Download the images to a local folder so that they will appear in the ebook.
    # Returns the HTML with every successfully downloaded image's src pointing
    # at its local copy; images that fail to download keep their original src.
    def localize_images(html)
      noko = Nokogiri::HTML(html)
      noko.xpath("//img/@src").each do |src|
        absolute_url = nil
        begin
          # Convert all relative paths to absolute ones so that we can download the images.
          absolute_url = URI.join(@url, src.value)
          img = absolute_url.open.read

          # Only touch the document and the disk once the download succeeded,
          # so a failure never leaves a reference to an empty file behind.
          filename = unique_image_path(File.extname(absolute_url.path))
          File.open(filename, "wb") { |f| f.write(img) }
          src.value = filename
        rescue StandardError => e
          puts "Failed to load #{absolute_url || src.value} (#{e})"
        end
      end

      noko.to_html
    end

    # Returns a path inside .images-temp that does not exist yet.
    # extension includes the leading dot and may be empty.
    def unique_image_path(extension)
      loop do
        candidate = ".images-temp/" + SecureRandom.hex + extension
        return candidate unless File.exist?(candidate)
      end
    end
  end
end
data/lib/elibrum.rb ADDED
@@ -0,0 +1,21 @@
1
# Entry point of the gem: loads the Ruby and Java dependencies, then the
# library's own classes.
raise "Elibrum only works on JRuby at the moment." unless RUBY_PLATFORM =~ /java/

require "open-uri"
require "uri"
# Library names are lowercase; the capitalized spellings only load on
# case-insensitive filesystems.
require "fileutils"
require "securerandom"
require "erubis"
require "nokogiri"
require "java"
require "elibrum/jars/boilerpipe-1.2.0.jar"
require "elibrum/jars/nekohtml-1.9.13.jar"
require "elibrum/jars/xerces-2.9.1.jar"

java_import "de.l3s.boilerpipe.extractors.CommonExtractors"
java_import "de.l3s.boilerpipe.sax.HTMLHighlighter"
java_import "de.l3s.boilerpipe.sax.HTMLDocument"
java_import java.net.URL

require "elibrum/builder"
require "elibrum/book"
require "elibrum/webpage"
metadata ADDED
@@ -0,0 +1,91 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: elibrum
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.1.0
6
+ platform: java
7
+ authors:
8
+ - Dave Sescleifer
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-01-28 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ version_requirements: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ~>
19
+ - !ruby/object:Gem::Version
20
+ version: 1.5.6
21
+ none: false
22
+ requirement: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: 1.5.6
27
+ none: false
28
+ prerelease: false
29
+ type: :runtime
30
+ - !ruby/object:Gem::Dependency
31
+ name: erubis
32
+ version_requirements: !ruby/object:Gem::Requirement
33
+ requirements:
34
+ - - ~>
35
+ - !ruby/object:Gem::Version
36
+ version: 2.7.0
37
+ none: false
38
+ requirement: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - ~>
41
+ - !ruby/object:Gem::Version
42
+ version: 2.7.0
43
+ none: false
44
+ prerelease: false
45
+ type: :runtime
46
+ description: Converts webpages into ebooks.
47
+ email: dave@sescleifer.com
48
+ executables: []
49
+ extensions: []
50
+ extra_rdoc_files: []
51
+ files:
52
+ - Gemfile
53
+ - README.md
54
+ - elibrum.gemspec
55
+ - lib/elibrum.rb
56
+ - lib/elibrum/book.rb
57
+ - lib/elibrum/builder.rb
58
+ - lib/elibrum/default.erb
59
+ - lib/elibrum/jars/LICENSE.txt
60
+ - lib/elibrum/jars/NOTICE.txt
61
+ - lib/elibrum/jars/boilerpipe-1.2.0.jar
62
+ - lib/elibrum/jars/nekohtml-1.9.13.jar
63
+ - lib/elibrum/jars/xerces-2.9.1.jar
64
+ - lib/elibrum/version.rb
65
+ - lib/elibrum/webpage.rb
66
+ homepage: http://github.com/dsesclei/elibrum
67
+ licenses: []
68
+ post_install_message:
69
+ rdoc_options: []
70
+ require_paths:
71
+ - lib
72
+ required_ruby_version: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - ! '>='
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ none: false
78
+ required_rubygems_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ! '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ none: false
84
+ requirements: []
85
+ rubyforge_project:
86
+ rubygems_version: 1.8.24
87
+ signing_key:
88
+ specification_version: 3
89
+ summary: Converts webpages into ebooks
90
+ test_files: []
91
+ ...