elibrum 0.1.0-java
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +4 -0
- data/README.md +64 -0
- data/elibrum.gemspec +20 -0
- data/lib/elibrum/book.rb +20 -0
- data/lib/elibrum/builder.rb +40 -0
- data/lib/elibrum/default.erb +3 -0
- data/lib/elibrum/jars/LICENSE.txt +18 -0
- data/lib/elibrum/jars/NOTICE.txt +24 -0
- data/lib/elibrum/jars/boilerpipe-1.2.0.jar +0 -0
- data/lib/elibrum/jars/nekohtml-1.9.13.jar +0 -0
- data/lib/elibrum/jars/xerces-2.9.1.jar +0 -0
- data/lib/elibrum/version.rb +3 -0
- data/lib/elibrum/webpage.rb +94 -0
- data/lib/elibrum.rb +21 -0
- metadata +91 -0
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
Elibrum
|
2
|
+
=======
|
3
|
+
|
4
|
+
Elibrum converts webpages into ebooks. It extracts the text with Boilerpipe and builds the ebook with `ebook-convert`.
|
5
|
+
|
6
|
+
JRuby running in Ruby 1.9 mode (Ruby 1.9.2 compatibility or greater) is required.
|
7
|
+
|
8
|
+
## Installation
|
9
|
+
|
10
|
+
Install [Calibre](http://calibre-ebook.com/) to get `ebook-convert`.
|
11
|
+
|
12
|
+
```
|
13
|
+
$ rvm install jruby
|
14
|
+
$ rvm use jruby
|
15
|
+
$ export JRUBY_OPTS=--1.9
|
16
|
+
$ gem install elibrum
|
17
|
+
```
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
```
|
22
|
+
require "elibrum"
|
23
|
+
|
24
|
+
# Must be set. Usually in the following location on OS X:
|
25
|
+
Elibrum::EBOOK_CONVERT_PATH = "/Applications/calibre.app/Contents/MacOS/ebook-convert"
|
26
|
+
|
27
|
+
Elibrum::Builder.new("git_tutorials", :epub) do
|
28
|
+
add "http://gitready.com/intermediate/2009/01/31/intro-to-rebase.html"
|
29
|
+
add "http://gitready.com/intermediate/2009/02/13/list-remote-branches.html"
|
30
|
+
add "http://gitready.com/advanced/2009/07/31/tig-the-ncurses-front-end-to-git.html"
|
31
|
+
end
|
32
|
+
|
33
|
+
links = []
|
34
|
+
links << "http://techcrunch.com/2013/01/25/eu-enlists-telefonica-cisco-hp-nokia-arm-and-others-to-close-the-700k-it-job-gap-in-europe/"
|
35
|
+
links << "http://techcrunch.com/2013/01/25/h265-is-approved/"
|
36
|
+
links << "http://techcrunch.com/2013/01/24/nokia-confirms-the-pure-view-was-officially-the-last-symbian-phone/"
|
37
|
+
|
38
|
+
Elibrum::Builder.new("blog_posts", [:epub, :mobi, :pdf]) do |b|
|
39
|
+
b.title = "Some Blog Posts"
|
40
|
+
b.author = "TechCrunch"
|
41
|
+
|
42
|
+
b.add *links do |a|
|
43
|
+
a.extractor = :largest_content
|
44
|
+
a.modify do |title, text|
|
45
|
+
title = title.split(" | ").first.strip
|
46
|
+
text = text.split("Comments").last
|
47
|
+
[title, text]
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
# Frontpage of Hacker News as an ebook!
|
53
|
+
require "ruby-hackernews"
|
54
|
+
Elibrum::Builder.new("frontpage", [:epub, :pdf]) do
|
55
|
+
links = RubyHackernews::Entry.all.map{|e| e.link.href}.reject{|l| l[0...4] != "http"}
|
56
|
+
add *links
|
57
|
+
end
|
58
|
+
```
|
59
|
+
|
60
|
+
## TODO
|
61
|
+
|
62
|
+
1. Add tests
|
63
|
+
1. Modify Boilerpipe to send user agent and accept a string as input (see Webpage#text)
|
64
|
+
1. Use [gae-boilerpipe](https://github.com/gregbayer/gae-boilerpipe) to make the project pure Ruby
|
data/elibrum.gemspec
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# Gem specification for elibrum.
# The gem's own lib/ directory is put on the load path so that the version
# constant can be read from elibrum/version.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.push(lib_dir)
require "elibrum/version"

Gem::Specification.new do |spec|
  # Identity and descriptive metadata.
  spec.name          = "elibrum"
  spec.version       = Elibrum::VERSION
  spec.authors       = ["Dave Sescleifer"]
  spec.summary       = "Converts webpages into ebooks"
  spec.description   = "Converts webpages into ebooks."
  spec.email         = "dave@sescleifer.com"
  spec.platform      = "java"
  spec.homepage      = "http://github.com/dsesclei/elibrum"
  spec.require_paths = ["lib"]

  # Packaged files are whatever git currently tracks.
  spec.files       = `git ls-files`.split("\n")
  spec.test_files  = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }

  # Runtime dependencies, each pinned with a pessimistic version constraint.
  { "nokogiri" => "~> 1.5.6", "erubis" => "~> 2.7.0" }.each do |gem_name, constraint|
    spec.add_runtime_dependency gem_name, [constraint]
  end
end
|
data/lib/elibrum/book.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
module Elibrum
  # A book in the making: a title, an author and the list of pages added so far.
  class Book
    attr_accessor :title, :author
    attr_reader :pages

    # Creates a book and optionally configures it through a block.
    #
    # filename - String base name of the output file. The default title is derived
    #            from it by splitting on "_" and "-" and capitalizing each part
    #            ("git_tutorials" becomes "Git Tutorials").
    # block    - Optional configuration block. A block that declares no parameter is
    #            evaluated in the context of the book (so `add`, `title=` ... can be
    #            called directly); a block that declares one receives the book as
    #            its argument.
    def initialize(filename, &block)
      @title = filename.split(/_|-/).map(&:capitalize).join(" ")
      @author = "Unknown"
      @pages = []

      # A book may be created without a configuration block; calling #arity on a
      # missing block would raise NoMethodError.
      if block
        block.arity < 1 ? instance_eval(&block) : block.call(self)
      end
    end

    # Appends one Webpage per link to the book.
    #
    # links - One or more URL strings.
    # block - Optional per-page configuration block, handed unchanged to every
    #         Webpage that is created.
    def add(*links, &block)
      links.each do |url|
        @pages << Webpage.new(url, &block)
      end
    end
  end
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Elibrum
  # Builds ebook files from a Book: every page is rendered through an Erubis
  # template into one temporary HTML file, which is then handed to the
  # ebook-convert executable named by Elibrum::EBOOK_CONVERT_PATH.
  class Builder
    # Scratch directory created before the book is assembled and removed afterwards.
    IMAGES_DIR = ".images-temp"

    # Assembles the book and converts it to every requested format.
    #
    # filename - String base name of the output files ("<filename>.<format>").
    # formats  - A single format (e.g. :epub) or an Array of formats.
    # template - Path to the Erubis template used for each page; defaults to the
    #            default.erb that sits next to this file.
    # block    - Configuration block forwarded to Book.new.
    #
    # Raises RuntimeError when Elibrum::EBOOK_CONVERT_PATH has not been defined.
    def initialize(filename, formats, template=default_template, &block)
      raise "Elibrum::EBOOK_CONVERT_PATH is not defined. Put the path to ebook-convert in this constant." unless defined?(EBOOK_CONVERT_PATH)

      FileUtils.mkdir_p(IMAGES_DIR)
      begin
        @book = Book.new(filename, &block)
        @template = template

        [*formats].each do |format|
          build(filename, format)
        end
      ensure
        # Always clean up the scratch directory, even when a page or a conversion fails.
        FileUtils.rm_rf(IMAGES_DIR)
      end
    end

    # Converts the rendered book into one output format.
    #
    # filename - String base name of the output file.
    # format   - Output format, used as the file extension.
    #
    # Returns whatever ebook-convert wrote to standard output.
    def build(filename, format)
      # Create an HTML file to use as the source file for conversion
      source = ".#{filename}-temp.html"
      begin
        File.open(source, "w+") { |f| f.write(html) }

        # Documentation for ebook-convert can be found here: http://manual.calibre-ebook.com/cli/ebook-convert.html
        # The command is passed as an argument list so that no shell is involved:
        # quotes, spaces or "$" in the title, author or filename reach ebook-convert
        # verbatim instead of being interpreted.
        command = [
          EBOOK_CONVERT_PATH, source, "#{filename}.#{format}",
          "--title", @book.title.to_s,
          "--authors", @book.author.to_s,
          "--chapter", "//h:pagebreak"
        ]
        IO.popen(command) { |io| io.read }
      ensure
        # The intermediate HTML file is only needed while ebook-convert runs.
        FileUtils.rm_f(source)
      end
    end

    private

    # Renders every page of the book through the template and concatenates the
    # results. The rendered HTML is memoized so that several formats share it.
    def html
      @html ||= begin
        # Compile the template once and reuse it for all pages.
        renderer = Erubis::Eruby.new(File.read(@template))
        @book.pages.map do |page|
          renderer.result(title: page.title, text: page.text, url: page.url)
        end.join
      end
    end

    # Path of the template that is used when the caller does not supply one.
    def default_template
      File.expand_path("default.erb", File.dirname(__FILE__))
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
|
2
|
+
boilerpipe
|
3
|
+
|
4
|
+
Copyright (c) 2009-2011 Christian Kohlschütter
|
5
|
+
|
6
|
+
The author licenses this file to You under the Apache License, Version 2.0
|
7
|
+
(the "License"); you may not use this file except in compliance with
|
8
|
+
the License. You may obtain a copy of the License at
|
9
|
+
|
10
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
|
12
|
+
Unless required by applicable law or agreed to in writing, software
|
13
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
14
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15
|
+
See the License for the specific language governing permissions and
|
16
|
+
limitations under the License.
|
17
|
+
|
18
|
+
|
@@ -0,0 +1,24 @@
|
|
1
|
+
|
2
|
+
boilerpipe
|
3
|
+
|
4
|
+
Copyright (c) 2009-2011 Christian Kohlschütter
|
5
|
+
|
6
|
+
The author licenses this file to You under the Apache License, Version 2.0
|
7
|
+
(the "License"); you may not use this file except in compliance with
|
8
|
+
the License. You may obtain a copy of the License at
|
9
|
+
|
10
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
|
12
|
+
Unless required by applicable law or agreed to in writing, software
|
13
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
14
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15
|
+
See the License for the specific language governing permissions and
|
16
|
+
limitations under the License.
|
17
|
+
|
18
|
+
|
19
|
+
This software contains the following parts which are also provided
|
20
|
+
under the Apache License 2.0 (http://apache.org/licenses/LICENSE-2.0.txt):
|
21
|
+
|
22
|
+
- NekoHTML
|
23
|
+
- Xerces
|
24
|
+
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,94 @@
|
|
1
|
+
module Elibrum
  # One web page of a book. On construction the page's title and extracted text
  # are obtained, passed through the optional modify block, and the images
  # referenced by the text are downloaded into the ".images-temp" directory.
  class Webpage
    attr_reader :url, :extractor

    # Extension given to a downloaded image when none can be derived from its URL.
    DEFAULT_IMAGE_EXTENSION = "img"

    # Errors that must never be swallowed by the best-effort rescues below.
    FATAL_ERRORS = [SignalException, SystemExit, NoMemoryError]

    # url   - String URL of the page.
    # block - Optional configuration block. Without a declared parameter it is
    #         evaluated in the context of the page; otherwise the page is passed
    #         to it. Typically used to set `extractor=` and `modify`.
    def initialize(url, &block)
      # Delete the trailing / so that URI.join in localize_images doesn't confuse this URL for a directory
      @url = url[-1] == "/" ? url[0...-1] : url
      @modify = proc {|title, text| [title, text]}
      @extractor = CommonExtractors::ARTICLE_EXTRACTOR

      if block
        block.arity < 1 ? instance_eval(&block) : block.call(self)
      end

      @title, @text = @modify.call(title, text)
      @text = localize_images(@text)
    end

    # Text of the page's <title> element (memoized).
    def title
      @title ||= Nokogiri::HTML(content).xpath("//title").text
    end

    # Result of running the configured Boilerpipe extractor on the page
    # (memoized). When extraction fails, the failure message is printed and
    # returned in place of the text.
    def text
      # The page is loaded both here and in Webpage#content.
      # TODO: Modify process() to accept a string as input.
      @text ||= begin
        highlighter = HTMLHighlighter.newExtractingInstance(true, false)
        highlighter.process(URL.new(@url), @extractor)
      rescue Exception => e
        # NOTE(review): the rescue stays broad because errors raised by the Java
        # library may not descend from StandardError under JRuby - confirm.
        # Interrupts and exit requests are passed on instead of being swallowed.
        raise if fatal?(e)
        # Boilerpipe does not pass along a user agent when retrieving sites, so some return a 403
        msg = "Failed to load #{@url} (#{e})"
        puts msg
        msg
      end
    end

    # Selects the Boilerpipe extractor by name.
    #
    # extractor_type - One of :article, :canola, :default, :keep_everything,
    #                  :largest_content.
    #
    # Raises RuntimeError for any other value.
    def extractor=(extractor_type)
      @extractor = case extractor_type
      when :article
        CommonExtractors::ARTICLE_EXTRACTOR
      when :canola
        CommonExtractors::CANOLA_EXTRACTOR
      when :default
        CommonExtractors::DEFAULT_EXTRACTOR
      when :keep_everything
        CommonExtractors::KEEP_EVERYTHING_EXTRACTOR
      when :largest_content
        CommonExtractors::LARGEST_CONTENT_EXTRACTOR
      else
        raise "Invalid extractor: #{extractor_type}"
      end
    end

    # Allows post-extraction processing of the text.
    # Useful for removing bits that the extractor accidentally included.
    # The block receives (title, text) and must return [title, text].
    def modify(&block)
      @modify = block
    end

    private

    # Raw body of the page (memoized).
    def content
      @content ||= fetch(@url)
    end

    # Downloads the resource behind a URL and returns its body.
    # Going through URI#open (provided by open-uri) instead of Kernel#open means
    # the argument is always treated as a URL, never as a local path or a pipe.
    def fetch(url)
      URI.parse(url).open { |io| io.read }
    end

    # True for errors that signal the process should stop (Ctrl-C, exit, out of memory).
    def fatal?(error)
      FATAL_ERRORS.any? { |klass| error.is_a?(klass) }
    end

    # Derives a file extension from an image URL: the part after the last "."
    # with any query string removed. Falls back to DEFAULT_IMAGE_EXTENSION when
    # that part is missing or contains anything but letters and digits (for
    # example "com/image" for a URL whose last path segment has no extension),
    # since such a value cannot be used in a file name.
    def image_extension(src)
      candidate = src.to_s.split(".").last.to_s.split("?").first.to_s
      candidate =~ /\A[A-Za-z0-9]+\z/ ? candidate : DEFAULT_IMAGE_EXTENSION
    end

    # Returns a random path inside ".images-temp" that does not exist yet.
    def unique_image_path(extension)
      loop do
        candidate = ".images-temp/" + SecureRandom.hex + "." + extension
        return candidate unless File.exist?(candidate)
      end
    end

    # Download the images to a local folder so that they will appear in the ebook.
    #
    # html - String of HTML whose <img src> attributes should be localized.
    #
    # Returns the HTML with the src of every successfully downloaded image
    # pointing at its local copy. An image that cannot be resolved or downloaded
    # is reported on standard output and keeps its original src.
    def localize_images(html)
      document = Nokogiri::HTML(html)
      document.xpath("//img/@src").each do |src|
        absolute_url = nil
        begin
          # Convert all relative paths to absolute ones so that we can download the images.
          absolute_url = URI.join(@url, src.value).to_s
          image = fetch(absolute_url)

          # Only touch the file system and the document once the download succeeded,
          # so a failure leaves neither an empty file nor a dangling reference.
          local_path = unique_image_path(image_extension(src.value))
          File.open(local_path, "wb") { |f| f.write(image) }
          src.value = local_path
        rescue Exception => e
          raise if fatal?(e)
          puts "Failed to load #{absolute_url || src.value} (#{e})"
        end
      end

      document.to_html
    end
  end
end
|
data/lib/elibrum.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
raise "Elibrum only works on JRuby at the moment." unless RUBY_PLATFORM =~ /java/
|
2
|
+
|
3
|
+
require "open-uri"
|
4
|
+
require "uri"
|
5
|
+
# The library file is named "fileutils"; the capitalized name only loads on
# case-insensitive filesystems.
require "fileutils"
|
6
|
+
# The library file is named "securerandom"; the capitalized name only loads on
# case-insensitive filesystems.
require "securerandom"
|
7
|
+
require "erubis"
|
8
|
+
require "nokogiri"
|
9
|
+
require "java"
|
10
|
+
require "elibrum/jars/boilerpipe-1.2.0.jar"
|
11
|
+
require "elibrum/jars/nekohtml-1.9.13.jar"
|
12
|
+
require "elibrum/jars/xerces-2.9.1.jar"
|
13
|
+
|
14
|
+
java_import "de.l3s.boilerpipe.extractors.CommonExtractors"
|
15
|
+
java_import "de.l3s.boilerpipe.sax.HTMLHighlighter"
|
16
|
+
java_import "de.l3s.boilerpipe.sax.HTMLDocument"
|
17
|
+
java_import java.net.URL
|
18
|
+
|
19
|
+
require "elibrum/builder"
|
20
|
+
require "elibrum/book"
|
21
|
+
require "elibrum/webpage"
|
metadata
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: elibrum
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.1.0
|
6
|
+
platform: java
|
7
|
+
authors:
|
8
|
+
- Dave Sescleifer
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-01-28 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
version_requirements: !ruby/object:Gem::Requirement
|
17
|
+
requirements:
|
18
|
+
- - ~>
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: 1.5.6
|
21
|
+
none: false
|
22
|
+
requirement: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 1.5.6
|
27
|
+
none: false
|
28
|
+
prerelease: false
|
29
|
+
type: :runtime
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: erubis
|
32
|
+
version_requirements: !ruby/object:Gem::Requirement
|
33
|
+
requirements:
|
34
|
+
- - ~>
|
35
|
+
- !ruby/object:Gem::Version
|
36
|
+
version: 2.7.0
|
37
|
+
none: false
|
38
|
+
requirement: !ruby/object:Gem::Requirement
|
39
|
+
requirements:
|
40
|
+
- - ~>
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: 2.7.0
|
43
|
+
none: false
|
44
|
+
prerelease: false
|
45
|
+
type: :runtime
|
46
|
+
description: Converts webpages into ebooks.
|
47
|
+
email: dave@sescleifer.com
|
48
|
+
executables: []
|
49
|
+
extensions: []
|
50
|
+
extra_rdoc_files: []
|
51
|
+
files:
|
52
|
+
- Gemfile
|
53
|
+
- README.md
|
54
|
+
- elibrum.gemspec
|
55
|
+
- lib/elibrum.rb
|
56
|
+
- lib/elibrum/book.rb
|
57
|
+
- lib/elibrum/builder.rb
|
58
|
+
- lib/elibrum/default.erb
|
59
|
+
- lib/elibrum/jars/LICENSE.txt
|
60
|
+
- lib/elibrum/jars/NOTICE.txt
|
61
|
+
- lib/elibrum/jars/boilerpipe-1.2.0.jar
|
62
|
+
- lib/elibrum/jars/nekohtml-1.9.13.jar
|
63
|
+
- lib/elibrum/jars/xerces-2.9.1.jar
|
64
|
+
- lib/elibrum/version.rb
|
65
|
+
- lib/elibrum/webpage.rb
|
66
|
+
homepage: http://github.com/dsesclei/elibrum
|
67
|
+
licenses: []
|
68
|
+
post_install_message:
|
69
|
+
rdoc_options: []
|
70
|
+
require_paths:
|
71
|
+
- lib
|
72
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - ! '>='
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0'
|
77
|
+
none: false
|
78
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ! '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
none: false
|
84
|
+
requirements: []
|
85
|
+
rubyforge_project:
|
86
|
+
rubygems_version: 1.8.24
|
87
|
+
signing_key:
|
88
|
+
specification_version: 3
|
89
|
+
summary: Converts webpages into ebooks
|
90
|
+
test_files: []
|
91
|
+
...
|