elibrum 0.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +4 -0
- data/README.md +64 -0
- data/elibrum.gemspec +20 -0
- data/lib/elibrum/book.rb +20 -0
- data/lib/elibrum/builder.rb +40 -0
- data/lib/elibrum/default.erb +3 -0
- data/lib/elibrum/jars/LICENSE.txt +18 -0
- data/lib/elibrum/jars/NOTICE.txt +24 -0
- data/lib/elibrum/jars/boilerpipe-1.2.0.jar +0 -0
- data/lib/elibrum/jars/nekohtml-1.9.13.jar +0 -0
- data/lib/elibrum/jars/xerces-2.9.1.jar +0 -0
- data/lib/elibrum/version.rb +3 -0
- data/lib/elibrum/webpage.rb +94 -0
- data/lib/elibrum.rb +21 -0
- metadata +91 -0
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
Elibrum
|
2
|
+
=======
|
3
|
+
|
4
|
+
Elibrum converts webpages into ebooks. It extracts the text with Boilerpipe and builds the ebook with `ebook-convert`.
|
5
|
+
|
6
|
+
JRuby running in Ruby 1.9 mode (Ruby 1.9.2 compatibility or greater) is required.
|
7
|
+
|
8
|
+
## Installation
|
9
|
+
|
10
|
+
Install [Calibre](http://calibre-ebook.com/) to get `ebook-convert`.
|
11
|
+
|
12
|
+
```
|
13
|
+
$ rvm install jruby
|
14
|
+
$ rvm use jruby
|
15
|
+
$ export JRUBY_OPTS=--1.9
|
16
|
+
$ gem install elibrum
|
17
|
+
```
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
```
|
22
|
+
require "elibrum"
|
23
|
+
|
24
|
+
# Must be set. Usually in the following location on OS X:
|
25
|
+
Elibrum::EBOOK_CONVERT_PATH = "/Applications/calibre.app/Contents/MacOS/ebook-convert"
|
26
|
+
|
27
|
+
Elibrum::Builder.new("git_tutorials", :epub) do
|
28
|
+
add "http://gitready.com/intermediate/2009/01/31/intro-to-rebase.html"
|
29
|
+
add "http://gitready.com/intermediate/2009/02/13/list-remote-branches.html"
|
30
|
+
add "http://gitready.com/advanced/2009/07/31/tig-the-ncurses-front-end-to-git.html"
|
31
|
+
end
|
32
|
+
|
33
|
+
links = []
|
34
|
+
links << "http://techcrunch.com/2013/01/25/eu-enlists-telefonica-cisco-hp-nokia-arm-and-others-to-close-the-700k-it-job-gap-in-europe/"
|
35
|
+
links << "http://techcrunch.com/2013/01/25/h265-is-approved/"
|
36
|
+
links << "http://techcrunch.com/2013/01/24/nokia-confirms-the-pure-view-was-officially-the-last-symbian-phone/"
|
37
|
+
|
38
|
+
Elibrum::Builder.new("blog_posts", [:epub, :mobi, :pdf]) do |b|
|
39
|
+
b.title = "Some Blog Posts"
|
40
|
+
b.author = "TechCrunch"
|
41
|
+
|
42
|
+
b.add *links do |a|
|
43
|
+
a.extractor = :largest_content
|
44
|
+
a.modify do |title, text|
|
45
|
+
title = title.split(" | ").first.strip
|
46
|
+
text = text.split("Comments").last
|
47
|
+
[title, text]
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
# Frontpage of Hacker News as an ebook!
|
53
|
+
require "ruby-hackernews"
|
54
|
+
Elibrum::Builder.new("frontpage", [:epub, :pdf]) do
|
55
|
+
links = RubyHackernews::Entry.all.map{|e| e.link.href}.reject{|l| l[0...4] != "http"}
|
56
|
+
add *links
|
57
|
+
end
|
58
|
+
```
|
59
|
+
|
60
|
+
## TODO
|
61
|
+
|
62
|
+
1. Add tests
|
63
|
+
1. Modify Boilerpipe to send user agent and accept a string as input (see Webpage#text)
|
64
|
+
1. Use [gae-boilerpipe](https://github.com/gregbayer/gae-boilerpipe) to make the project pure Ruby
|
data/elibrum.gemspec
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# Make lib/ loadable so the version constant can be read while building the gem.
$LOAD_PATH.push(File.expand_path("../lib", __FILE__))
require "elibrum/version"

Gem::Specification.new do |spec|
  # Identity
  spec.name     = "elibrum"
  spec.version  = Elibrum::VERSION
  spec.platform = "java"

  # Authorship and description
  spec.authors     = ["Dave Sescleifer"]
  spec.email       = "dave@sescleifer.com"
  spec.homepage    = "http://github.com/dsesclei/elibrum"
  spec.summary     = "Converts webpages into ebooks"
  spec.description = "Converts webpages into ebooks."

  # Packaged files are whatever git tracks at build time.
  spec.require_paths = ["lib"]
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }

  # Runtime dependencies
  spec.add_runtime_dependency "nokogiri", "~> 1.5.6"
  spec.add_runtime_dependency "erubis", "~> 2.7.0"
end
|
data/lib/elibrum/book.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
module Elibrum
  # A book is a title, an author and an ordered list of Webpage objects.
  #
  # The optional configuration block is either instance_eval'ed on the book
  # (when it declares no parameters) or called with the book as its argument.
  class Book
    attr_accessor :title, :author
    attr_reader :pages

    # filename - String base name of the output file. The default title is
    #            derived from it by splitting on "_" and "-" and capitalizing
    #            each word.
    # block    - Optional configuration block (see the class comment).
    def initialize(filename, &block)
      @title = filename.split(/_|-/).map(&:capitalize).join(" ")
      @author = "Unknown"
      @pages = []

      # A book may be created without a configuration block.
      return unless block

      block.arity < 1 ? instance_eval(&block) : block.call(self)
    end

    # Appends one Webpage per link to the book.
    #
    # links - One or more URL strings.
    # block - Optional block forwarded to every Webpage.new call.
    #
    # Returns the Array of links that was passed in.
    def add(*links, &block)
      links.each do |url|
        @pages << Webpage.new(url, &block)
      end
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Elibrum
  # Collects webpages into a Book and converts it to one or more ebook
  # formats by running the external ebook-convert program.
  #
  # All of the work happens in the constructor: the book is assembled from
  # the given block, every requested format is built, and the temporary
  # image directory is removed afterwards.
  class Builder
    # filename - String base name of the output files ("#{filename}.#{format}").
    # formats  - A single format or an Array of formats (e.g. :epub, :mobi).
    # template - Path of the ERB template rendered once per page. Defaults to
    #            default.erb next to this file.
    # block    - Configuration block handed to Book.new.
    #
    # Raises RuntimeError when Elibrum::EBOOK_CONVERT_PATH is not defined.
    def initialize(filename, formats, template=default_template, &block)
      raise "Elibrum::EBOOK_CONVERT_PATH is not defined. Put the path to ebook-convert in this constant." unless defined?(EBOOK_CONVERT_PATH)

      # Webpage stores downloaded images here while the book is assembled.
      FileUtils.mkdir_p(".images-temp")

      begin
        @book = Book.new(filename, &block)
        @template = template

        [*formats].each do |format|
          build(filename, format)
        end
      ensure
        # Clean up even when a page or a conversion raised.
        FileUtils.rm_rf(".images-temp")
      end
    end

    # Converts the book into a single format.
    #
    # Writes the rendered HTML to a temporary file, runs ebook-convert on it
    # and removes the temporary file afterwards. Documentation for
    # ebook-convert: http://manual.calibre-ebook.com/cli/ebook-convert.html
    #
    # EBOOK_CONVERT_PATH must be the plain path of the executable: it is
    # passed as a single program name and is no longer parsed by a shell.
    #
    # Returns the standard output of ebook-convert as a String.
    def build(filename, format)
      source = ".#{filename}-temp.html"
      File.open(source, "w") { |f| f.write(html) }

      # An argument vector bypasses the shell, so quotes, spaces or shell
      # metacharacters in the title, author or filename cannot break or
      # alter the command.
      command = [
        EBOOK_CONVERT_PATH, source, "#{filename}.#{format}",
        "--title", @book.title.to_s,
        "--authors", @book.author.to_s,
        "--chapter", "//h:pagebreak"
      ]
      IO.popen(command, &:read)
    ensure
      FileUtils.rm_f(source) if source
    end

    private

    # Renders every page through the template and concatenates the results.
    # Memoized so that several formats share one rendering.
    def html
      @html ||= begin
        renderer = Erubis::Eruby.new(File.read(@template))
        @book.pages.map do |page|
          renderer.result(title: page.title, text: page.text, url: page.url)
        end.join
      end
    end

    # Absolute path of the template shipped alongside this file.
    def default_template
      File.expand_path("default.erb", File.dirname(__FILE__))
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
|
2
|
+
boilerpipe
|
3
|
+
|
4
|
+
Copyright (c) 2009-2011 Christian Kohlschütter
|
5
|
+
|
6
|
+
The author licenses this file to You under the Apache License, Version 2.0
|
7
|
+
(the "License"); you may not use this file except in compliance with
|
8
|
+
the License. You may obtain a copy of the License at
|
9
|
+
|
10
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
|
12
|
+
Unless required by applicable law or agreed to in writing, software
|
13
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
14
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15
|
+
See the License for the specific language governing permissions and
|
16
|
+
limitations under the License.
|
17
|
+
|
18
|
+
|
@@ -0,0 +1,24 @@
|
|
1
|
+
|
2
|
+
boilerpipe
|
3
|
+
|
4
|
+
Copyright (c) 2009-2011 Christian Kohlschütter
|
5
|
+
|
6
|
+
The author licenses this file to You under the Apache License, Version 2.0
|
7
|
+
(the "License"); you may not use this file except in compliance with
|
8
|
+
the License. You may obtain a copy of the License at
|
9
|
+
|
10
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
|
12
|
+
Unless required by applicable law or agreed to in writing, software
|
13
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
14
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15
|
+
See the License for the specific language governing permissions and
|
16
|
+
limitations under the License.
|
17
|
+
|
18
|
+
|
19
|
+
This software contains the following parts which are also provided
|
20
|
+
under the Apache License 2.0 (http://apache.org/licenses/LICENSE-2.0.txt):
|
21
|
+
|
22
|
+
- NekoHTML
|
23
|
+
- Xerces
|
24
|
+
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,94 @@
|
|
1
|
+
module Elibrum
  # A single webpage of a book: its URL, its title and the extracted main
  # text (HTML) with all images downloaded into ".images-temp".
  #
  # The optional configuration block is either instance_eval'ed on the page
  # (when it declares no parameters) or called with the page as its argument.
  # Inside it the extractor can be selected and a #modify hook installed.
  class Webpage
    attr_reader :url, :extractor

    # url   - String address of the page.
    # block - Optional configuration block (see the class comment).
    #
    # The page is fetched, extracted and post-processed during construction.
    def initialize(url, &block)
      # Delete the trailing / so that URI.join in localize_images doesn't confuse this URL for a directory
      @url = url[-1] == "/" ? url[0...-1] : url
      @modify = proc {|title, text| [title, text]}
      @extractor = CommonExtractors::ARTICLE_EXTRACTOR

      if block
        block.arity < 1 ? instance_eval(&block) : block.call(self)
      end

      @title, @text = @modify.call(title, text)
      @text = localize_images(@text)
    end

    # Returns the text of the page's <title> element (memoized).
    def title
      @title ||= Nokogiri::HTML(content).xpath("//title").text
    end

    # Returns the extracted main content of the page (memoized). When the
    # extraction fails, the failure message is printed and returned in place
    # of the text.
    def text
      # The page is loaded both here and in Webpage#content.
      # TODO: Modify process() to accept a string as input.
      @text ||= begin
        highlighter = HTMLHighlighter.newExtractingInstance(true, false)
        highlighter.process(URL.new(@url), @extractor)
      # NOTE(review): Exception is rescued rather than StandardError,
      # presumably so that Java exceptions raised by Boilerpipe under JRuby
      # are caught as well -- confirm before narrowing.
      rescue Exception => e
        # Boilerpipe does not pass along a user agent when retrieving sites, so some return a 403
        msg = "Failed to load #{@url} (#{e})"
        puts msg
        msg
      end
    end

    # Selects the Boilerpipe extractor by name.
    #
    # extractor_type - One of :article, :canola, :default, :keep_everything
    #                  or :largest_content.
    #
    # Raises RuntimeError for any other value.
    def extractor=(extractor_type)
      @extractor = case extractor_type
      when :article
        CommonExtractors::ARTICLE_EXTRACTOR
      when :canola
        CommonExtractors::CANOLA_EXTRACTOR
      when :default
        CommonExtractors::DEFAULT_EXTRACTOR
      when :keep_everything
        CommonExtractors::KEEP_EVERYTHING_EXTRACTOR
      when :largest_content
        CommonExtractors::LARGEST_CONTENT_EXTRACTOR
      else
        raise "Invalid extractor: #{extractor_type}"
      end
    end

    # Allows post-extraction processing of the text.
    # Useful for removing bits that the extractor accidentally included.
    # The block receives (title, text) and must return [title, text].
    def modify(&block)
      @modify = block
    end

    private

    # Raw HTML of the page (memoized). The block form closes the handle.
    def content
      @content ||= open(@url) { |io| io.read }
    end

    # Download the images to a local folder so that they will appear in the ebook.
    #
    # Each <img src> is resolved against the page URL, downloaded into
    # ".images-temp" and rewritten to the local path. An image that cannot be
    # resolved or downloaded is reported and keeps its original src; an empty
    # src is skipped.
    #
    # Returns the rewritten document as an HTML String.
    def localize_images(html)
      noko = Nokogiri::HTML(html)
      noko.xpath("//img/@src").each do |src|
        next if src.value.to_s.strip.empty?

        begin
          # Convert all relative paths to absolute ones so that we can download the images.
          absolute_url = URI.join(@url, src.value).to_s
          img = open(absolute_url) { |io| io.read }

          filename = unique_image_path(image_extension(absolute_url))
          File.open(filename, "wb") { |f| f.write(img) }

          # Only point the document at the local copy once it really exists.
          src.value = filename
        rescue Exception => e
          puts "Failed to load #{absolute_url || src.value} (#{e})"
        end
      end

      noko.to_html
    end

    # File extension (including the dot) of the path part of url, or "" when
    # the path has none. The query string is ignored.
    def image_extension(url)
      File.extname(URI.parse(url).path.to_s)
    end

    # Picks a random, not yet existing file name inside ".images-temp".
    def unique_image_path(extension)
      loop do
        candidate = ".images-temp/" + SecureRandom.hex + extension
        return candidate unless File.exist?(candidate)
      end
    end
  end
end
|
data/lib/elibrum.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# Entry point of the gem: checks the platform, loads the Ruby and Java
# dependencies, then loads the library's own classes.
raise "Elibrum only works on JRuby at the moment." unless RUBY_PLATFORM =~ /java/

# Standard library. Feature names are lower case; the capitalized spellings
# only resolve on case-insensitive file systems.
require "open-uri"
require "uri"
require "fileutils"
require "securerandom"

# Gems
require "erubis"
require "nokogiri"

# Java integration and the bundled Boilerpipe jars
require "java"
require "elibrum/jars/boilerpipe-1.2.0.jar"
require "elibrum/jars/nekohtml-1.9.13.jar"
require "elibrum/jars/xerces-2.9.1.jar"

java_import "de.l3s.boilerpipe.extractors.CommonExtractors"
java_import "de.l3s.boilerpipe.sax.HTMLHighlighter"
java_import "de.l3s.boilerpipe.sax.HTMLDocument"
java_import java.net.URL

# Library classes
require "elibrum/builder"
require "elibrum/book"
require "elibrum/webpage"
|
metadata
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: elibrum
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.1.0
|
6
|
+
platform: java
|
7
|
+
authors:
|
8
|
+
- Dave Sescleifer
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-01-28 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
version_requirements: !ruby/object:Gem::Requirement
|
17
|
+
requirements:
|
18
|
+
- - ~>
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: 1.5.6
|
21
|
+
none: false
|
22
|
+
requirement: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 1.5.6
|
27
|
+
none: false
|
28
|
+
prerelease: false
|
29
|
+
type: :runtime
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: erubis
|
32
|
+
version_requirements: !ruby/object:Gem::Requirement
|
33
|
+
requirements:
|
34
|
+
- - ~>
|
35
|
+
- !ruby/object:Gem::Version
|
36
|
+
version: 2.7.0
|
37
|
+
none: false
|
38
|
+
requirement: !ruby/object:Gem::Requirement
|
39
|
+
requirements:
|
40
|
+
- - ~>
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: 2.7.0
|
43
|
+
none: false
|
44
|
+
prerelease: false
|
45
|
+
type: :runtime
|
46
|
+
description: Converts webpages into ebooks.
|
47
|
+
email: dave@sescleifer.com
|
48
|
+
executables: []
|
49
|
+
extensions: []
|
50
|
+
extra_rdoc_files: []
|
51
|
+
files:
|
52
|
+
- Gemfile
|
53
|
+
- README.md
|
54
|
+
- elibrum.gemspec
|
55
|
+
- lib/elibrum.rb
|
56
|
+
- lib/elibrum/book.rb
|
57
|
+
- lib/elibrum/builder.rb
|
58
|
+
- lib/elibrum/default.erb
|
59
|
+
- lib/elibrum/jars/LICENSE.txt
|
60
|
+
- lib/elibrum/jars/NOTICE.txt
|
61
|
+
- lib/elibrum/jars/boilerpipe-1.2.0.jar
|
62
|
+
- lib/elibrum/jars/nekohtml-1.9.13.jar
|
63
|
+
- lib/elibrum/jars/xerces-2.9.1.jar
|
64
|
+
- lib/elibrum/version.rb
|
65
|
+
- lib/elibrum/webpage.rb
|
66
|
+
homepage: http://github.com/dsesclei/elibrum
|
67
|
+
licenses: []
|
68
|
+
post_install_message:
|
69
|
+
rdoc_options: []
|
70
|
+
require_paths:
|
71
|
+
- lib
|
72
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - ! '>='
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0'
|
77
|
+
none: false
|
78
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ! '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
none: false
|
84
|
+
requirements: []
|
85
|
+
rubyforge_project:
|
86
|
+
rubygems_version: 1.8.24
|
87
|
+
signing_key:
|
88
|
+
specification_version: 3
|
89
|
+
summary: Converts webpages into ebooks
|
90
|
+
test_files: []
|
91
|
+
...
|