web2text 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/web2text +10 -0
- data/lib/web2text.rb +123 -0
- data/lib/web2text/crawl.rb +45 -0
- data/lib/web2text/crawler.rb +19 -0
- data/lib/web2text/formatters.rb +50 -0
- data/lib/web2text/version.rb +3 -0
- data/spec/crawl_spec.rb +45 -0
- data/spec/crawler_spec.rb +39 -0
- data/spec/formatters_spec.rb +81 -0
- data/spec/spec_helper.rb +27 -0
- data/spec/web2text_spec.rb +30 -0
- metadata +143 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 6e1c4b836214eee9b4901d606ebf4210744e4fae
|
4
|
+
data.tar.gz: c045333d81e7e5e4a5d2fac1d1b39d9d9ab0194e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ae8c3350043a9a9213b6ec63429238d582736ecb48b934e0a51329dc69e4fad775c5f5c764628fb6ff5367f9bc8d1ebec4e867e0b12081e1cd98724b100fd56f
|
7
|
+
data.tar.gz: 93853ba2c589dda156578037eaaf76fe9647ba2c8b468ddbe12ce2a995d92b28e3e05b11650efe3f1732b8fe43f5f7f78885aa4a62dffda92f71743f3ac64d11
|
data/bin/web2text
ADDED
data/lib/web2text.rb
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
require 'anemone'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
require 'optparse'
|
5
|
+
|
6
|
+
|
7
|
+
module Web2Text
|
8
|
+
# Base class for every error raised by Web2Text itself, so callers can
# rescue the whole family with a single `rescue Web2Text::Error`.
#
# The constructor is inherited unchanged from RuntimeError: a message may be
# given (`Error.new "boom"`) but is not mandatory, so a bare `raise Error`
# works as well.
class Error < RuntimeError
end

# Raised when the command line cannot be turned into a valid set of options
# (for example a wrong number of positional arguments).
class CommandError < Error
end
|
19
|
+
|
20
|
+
# Turns web2text's command-line arguments into an options hash.
#
# @param args [Array<String>] raw arguments; the array is cloned before
#   parsing, so the caller's copy is never modified.
# @return [Hash] with the keys :query, :sleep, :avoid, :focus, :formatter,
#   :ignore_robots_txt, :out and :url.
# @raise [Web2Text::CommandError] when `--files` names something that exists
#   but is not a directory, or when the number of positional arguments left
#   after option parsing is not exactly one (the URL).
def self.parse_cli(args)
  options = {
    query: "body",
    sleep: 0.0,
    avoid: [],
    focus: [],
    formatter: LinePrinter,
    ignore_robots_txt: false,
    out: $stdout,
  }

  args = args.clone

  parser = OptionParser.new do |opts|
    opts.banner = "Usage: web2text [options] http://example.com/"

    opts.on("-q", "--css", "--query=CSS_QUERY", String,
            "CSS query selecting the content to extract. Default \"body\".") do |q|
      options[:query] = q
    end

    # The help text used to claim a default of 1; the real default (see the
    # options hash above) is 0, and a bare -s means one second.
    opts.on("-s [OPTIONAL]", "--sleep [OPTIONAL]", Float,
            "Delay between requests in seconds. Default 0, -s sets to 1.") do |n|
      options[:sleep] = n || 1.0
    end

    opts.on("--avoid x,y,z", Array, "List of paths to avoid when crawling. These paths and everything below them will be ignored.") do |avoid|
      options[:avoid] = avoid
    end

    opts.on("--focus x,y,z", Array, "List of paths to process when crawling. Only these paths and pages below them will be processed") do |focus|
      options[:focus] = focus
    end

    opts.on("--lines [web2.txt]", String, "One line per page. Can print to std out or a file.") do |f|
      options[:formatter] = LinePrinter
      options[:out] = f ? File.open(f, 'w') : $stdout
    end

    opts.on("--files out/", String, "One file per page. Following website structure, in the specified directory.") do |o|
      options[:formatter] = FilePrinter
      options[:out] = Pathname(o)

      # A missing directory is fine (the formatter creates it); an existing
      # non-directory is not.
      if options[:out].exist? && !options[:out].directory?
        raise Web2Text::CommandError.new 'argument to --files must be a directory'
      end
    end

    opts.on("--bad-robot", "Ignore robots.txt") do
      options[:ignore_robots_txt] = true
    end

    opts.on_tail("-h", "--help", "Show this message") do
      puts opts
      exit
    end
  end

  # parse! strips every recognised option, leaving only positional arguments.
  parser.parse! args

  unless args.length == 1
    raise Web2Text::CommandError.new 'incorrect number of arguments!'
  end

  options[:url] = args[0]
  options
end
|
84
|
+
|
85
|
+
# Crawls the site described by +options+ and hands the plain text of every
# accepted page to the configured formatter.
#
# @param options [Hash] as produced by parse_cli; the keys read here are
#   :url, :avoid, :focus, :query, :formatter, :out, :ignore_robots_txt and
#   :sleep.
# @return whatever Anemone.crawl returns.
#
# The formatter is closed in an `ensure` clause rather than in an
# `after_crawl` hook, so an output file is also closed when the crawl is
# aborted by an exception. This assumes Anemone.crawl only returns once the
# crawl has finished -- NOTE(review): confirm against the Anemone docs.
def self.do_crawl(options)
  crawl = Crawl.new options[:url], options[:avoid], options[:focus]
  crawler = Crawler.new crawl, options[:query]
  formatter = options[:formatter].new crawl, options[:out]

  begin
    Anemone.crawl(crawl.url, :obey_robots_txt => !options[:ignore_robots_txt]) do |anemone|
      # Only follow the links the crawl does not want skipped.
      anemone.focus_crawl do |page|
        crawl.filter page.links
      end

      anemone.on_every_page do |page|
        STDERR.puts page.url

        # A page without a status code is treated as a plain 200.
        code = page.code || 200

        # ignore redirects
        next if code >= 300 && code < 400
        # pages outside the focus list are crawled but not written out
        next unless crawl.focus? page.url

        if page.doc.nil?
          STDERR.puts "ERR: Failed to retrieve #{page.url}"
          next
        end

        plain = crawler.doc_as_plaintext page.doc
        formatter.append plain, page.url
        sleep options[:sleep]
      end
    end
  ensure
    formatter.close
  end
end
|
119
|
+
end
|
120
|
+
|
121
|
+
require 'web2text/crawl'
|
122
|
+
require 'web2text/crawler'
|
123
|
+
require 'web2text/formatters'
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'uri'
|
2
|
+
|
3
|
+
# Opening the namespace with `module` (instead of `class Web2Text::Crawl`)
# lets this file be loaded on its own, before web2text.rb defined the module.
module Web2Text
  # Describes the boundaries of one crawl: the root URL, the locations to
  # avoid and the locations to focus on.
  #
  # A URL counts as being "under" a location when it equals that location or
  # continues it at a URL boundary ("/", "?" or "#"). "/avoid" therefore
  # covers "/avoid", "/avoid/x" and "/avoid?page=2", but not "/avoidance".
  class Crawl
    # Characters that may directly follow a location for a URL to be under it.
    BOUNDARY_CHARS = '/?#'.freeze

    # @return [String] the root URL of the crawl, exactly as given.
    attr_reader :url

    # @param url [String] root of the crawl; nothing outside it is visited.
    # @param avoid [Array<String>] locations to skip, either absolute URLs or
    #   paths resolved against +url+.
    # @param focus [Array<String>] locations to process, same format as
    #   +avoid+; empty means "process everything".
    def initialize(url, avoid = [], focus = [])
      @url = url
      @avoid = absolutize(avoid)
      @focus = absolutize(focus)
    end

    # @param urls [Array<#to_s>] candidate links.
    # @return [Array] the links that should be followed, in original order.
    def filter(urls)
      urls.reject { |u| skip? u }
    end

    # @param url [#to_s]
    # @return [Boolean] true when +url+ is outside the crawl root or under one
    #   of the avoided locations.
    def skip?(url)
      url_s = url.to_s
      return true unless under?(url_s, @url)

      @avoid.any? { |location| under?(url_s, location) }
    end

    # @param url [#to_s]
    # @return [Boolean] true when no focus list was given, or when +url+ is
    #   under one of the focused locations.
    def focus?(url)
      return true if @focus.empty?

      url_s = url.to_s
      @focus.any? { |location| under?(url_s, location) }
    end

    private

    # Resolves every entry that is not already written relative to the root
    # URL against it, returning plain strings.
    def absolutize(locations)
      locations.map do |location|
        location = URI.join(@url, location) unless location.start_with?(@url)
        location.to_s
      end
    end

    # True when +url_s+ is +location+ itself or lies below it. A bare string
    # prefix test is not enough: it would also accept sibling paths and hosts
    # that merely share the prefix.
    def under?(url_s, location)
      return false unless url_s.start_with?(location)
      return true if location.end_with?('/') || url_s.length == location.length

      BOUNDARY_CHARS.include?(url_s[location.length])
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
|
2
|
+
# Opening the namespace with `module` (instead of `class Web2Text::Crawler`)
# lets this file be loaded on its own, before web2text.rb defined the module.
module Web2Text
  # Extracts plain text from parsed pages.
  class Crawler
    # @param crawl the crawl this crawler belongs to (stored, not read here).
    # @param query [String] CSS query selecting the elements whose text is
    #   extracted.
    def initialize(crawl, query = "body")
      @crawl = crawl
      @query = query
    end

    # @param doc a parsed document responding to `css(query)`; every matched
    #   element must respond to `traverse`, yielding nodes that respond to
    #   `text?` and `content`.
    # @return [String] the content of all text nodes below the matched
    #   elements, separated by single spaces.
    def doc_as_plaintext(doc)
      doc.css(@query).map { |element| text_bits(element).join(' ') }.join(' ')
    end

    private

    # Collects the content of each text node under +element+ separately;
    # just using inner_text doesn't give us quite enough spaces.
    def text_bits(element)
      bits = []
      element.traverse { |node| bits << node.content if node.text? }
      bits
    end
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'uri'
|
2
|
+
|
3
|
+
# Opening the namespace with `module` (instead of `class
# Web2Text::LinePrinter`) lets this file be loaded on its own.
module Web2Text
  # Writes one line per page to a single output stream.
  class LinePrinter
    # @param crawl unused; accepted so every formatter is built the same way.
    # @param output [#write, #close] stream receiving the lines.
    def initialize(crawl, output)
      @output = output
      @first = true
    end

    # Writes +doc+ as a single line. Lines are separated by "\n"; no newline
    # is written after the last one.
    #
    # @param doc [String] plain text of the page.
    # @param uri unused; accepted so every formatter is called the same way.
    # @return [LinePrinter] self, so calls can be chained.
    def append(doc, uri)
      @output.write "\n" unless @first
      @first = false

      # Carriage returns are folded together with newlines so pages using
      # CRLF line endings do not leave stray "\r" characters in the line.
      @output.write doc.gsub(/[\r\n]+/, ' ')
      self
    end

    # Closes the output stream, unless it is one of the process's standard
    # streams: those are not owned by the printer and may still be needed.
    def close
      @output.close unless standard_stream?
    end

    private

    def standard_stream?
      [$stdout, $stderr, STDOUT, STDERR].any? { |io| io.equal?(@output) }
    end
  end
end
|
23
|
+
|
24
|
+
# Pathname is used below, so it is required here explicitly.
require 'pathname'

# Opening the namespace with `module` (instead of `class
# Web2Text::FilePrinter`) lets this file be loaded on its own.
module Web2Text
  # Writes one file per page, mirroring the site's structure below an output
  # directory. Every file gets a ".txt" extension; directory-like URLs become
  # "index.txt" inside the matching directory.
  class FilePrinter
    # @param crawl [#url] the crawl; the path of its root URL is the point
    #   page paths are made relative to.
    # @param out_dir [String, Pathname] directory to write into; created
    #   (with parents) when missing.
    def initialize(crawl, out_dir)
      @crawl_root = Pathname(url_path(crawl.url))
      @out_dir = Pathname(out_dir)
      @out_dir.mkpath
    end

    # Writes +doc+ to the file corresponding to +uri+.
    #
    # @param doc [String] plain text of the page.
    # @param uri [String, URI] address of the page.
    # @return [FilePrinter] self, so calls can be chained.
    def append(doc, uri)
      page_path = url_path(uri)
      relative = Pathname(page_path).relative_path_from(@crawl_root)

      # relative_path_from returns a cleaned path, so any ".." can only be
      # leading -- and means the page lives above the crawl root. Writing it
      # would leave the output directory, so it is skipped instead.
      if relative.each_filename.first == '..'
        STDERR.puts "ERR: Refusing to write #{uri} outside of #{@out_dir}"
        return self
      end

      # The directory test looks at the URL path and the relative part only,
      # so dots in the output directory's name ("site.v1") or in a directory
      # URL ("/v1.2/") are not mistaken for a file extension.
      relative += 'index.txt' if page_path.end_with?('/') || relative.extname.empty?

      target = @out_dir + relative.sub_ext('.txt')
      target.parent.mkpath
      target.open("w") { |f| f.write(doc) }
      self
    end

    # Nothing to release: every file is closed right after it is written.
    def close
    end

    private

    # Path component of +url+; a URL without a path ("http://example.com")
    # is treated as the site root "/".
    def url_path(url)
      path = URI(url).path.to_s
      path.empty? ? '/' : path
    end
  end
end
|
data/spec/crawl_spec.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'web2text'
|
2
|
+
|
3
|
+
root = "http://example.com"

# Specs for the URL filtering rules of Web2Text::Crawl. The class is named in
# full (rather than through an alias) so this file only needs
# `require 'web2text'` to run.
RSpec.describe Web2Text::Crawl, '#filter' do
  context "with no patterns" do
    it "returns all links" do
      crawl = described_class.new root
      links = ["#{root}/wow", "#{root}/neat"]
      expect(crawl.filter(links)).to eq links
      expect(links.select { |u| crawl.focus? u }).to eq links
    end

    it "will not crawl above the root" do
      crawl = described_class.new "#{root}/wow/cool"
      expect(crawl.skip?(root)).to be true
    end
  end

  context "with patterns" do
    it "can filter out whole directories" do
      good = ["#{root}/wow", "#{root}/neat"]
      bad = ["#{root}/avoid", "#{root}/avoid/index.html", "#{root}/avoid/this/nested/stuff"]

      crawl = described_class.new root, ["#{root}/avoid"]
      expect(crawl.filter(good + bad)).to eq good
    end

    it "can focus on pages" do
      # two different unfocused URLs (the list used to repeat the same one)
      bad = ["#{root}/avoid", "#{root}/avoid/index.html"]
      good = ["#{root}/focus", "#{root}/focus/index.html", "#{root}/focus/this/nested/stuff"]

      crawl = described_class.new root, [], ["#{root}/focus"]
      expect((good + bad).select { |u| crawl.focus? u }).to eq good
    end

    it "can skip host name parts to filter out directories" do
      good = ["#{root}/wow", "#{root}/neat"]
      bad = ["#{root}/avoid", "#{root}/avoid/index.html", "#{root}/avoid/this/nested/stuff"]

      crawl = described_class.new root, ["/avoid"]
      expect(crawl.filter(good + bad)).to eq good
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
require 'web2text'
|
4
|
+
|
5
|
+
# Specs for Web2Text::Crawler#doc_as_plaintext. The classes are named in full
# (rather than through aliases) so this file only needs its own requires.
RSpec.describe Web2Text::Crawler, '#doc_as_plaintext' do
  before(:all) do
    @root = "http://example.com"
    @crawl = Web2Text::Crawl.new @root

    @h1_content = "This is a document"
    @p_content = "good stuff!"

    @example_html = Nokogiri::HTML "<!doctype html><html><head></head><body><h1>#{@h1_content}</h1><p>#{@p_content}</p></body></html>"
  end

  before(:each) do
    @crawler = described_class.new @crawl
  end

  it 'can consider a page and make output' do
    out = @crawler.doc_as_plaintext @example_html
    expect(out).to eq "#{@h1_content} #{@p_content}"
  end

  it 'can limit the output by using css queries' do
    # [css query, expected text]; the last two show that the text follows
    # document order, not the order of the selectors in the query.
    tests = [
      ["p", @p_content],
      ["h1", @h1_content],
      ["p, h1", "#{@h1_content} #{@p_content}"],
      ["h1, p", "#{@h1_content} #{@p_content}"]
    ]

    tests.each do |query, expected|
      @crawler = described_class.new @crawl, query
      out = @crawler.doc_as_plaintext @example_html
      expect(out).to eq(expected), "with css query '#{query}', got '#{out}', but expected '#{expected}'"
    end
  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
require 'stringio'
|
3
|
+
require 'test_construct'
|
4
|
+
|
5
|
+
require 'web2text'
|
6
|
+
|
7
|
+
# Specs for the two output formatters. The classes are named in full, so no
# top-level constant aliases are needed and this file only depends on its own
# requires.
doc1 = "This is a document\nwith a newline"
doc2 = "This is another document"
# The root ends in a slash, so page URLs are built as "#{root}page".
root = 'http://example.com/wow/'

RSpec.describe Web2Text::LinePrinter, '#append' do
  it 'prints one line per document' do
    crawl = Web2Text::Crawl.new root
    result = StringIO.open do |out|
      described_class.new(crawl, out)
        .append(doc1, "#{root}index.html")
        .append(doc2, "#{root}cool/index.html")

      out.string
    end

    expect(result.lines.length).to eq(2)
    expect(result.lines[1]).to eq(doc2)
  end
end

RSpec.describe Web2Text::FilePrinter, '#append' do
  include TestConstruct::Helpers

  it 'prints one file per document' do
    crawl = Web2Text::Crawl.new root
    folder = 'test_output/'

    within_construct do |construct|
      construct.directory 'fileprinter_web2text' do |_dir|
        described_class.new(crawl, folder)
          .append(doc1, root)
          .append(doc2, "#{root}cool/index.html")
          .append(doc1, "#{root}no_slash")

        doc1_path = File.join folder, 'index.txt'
        expect(File.file?(doc1_path)).to be_truthy
        expect(IO.read(doc1_path)).to eq(doc1)

        doc2_path = File.join folder, 'cool', 'index.txt'
        expect(File.file?(doc2_path)).to be_truthy
        expect(IO.read(doc2_path)).to eq(doc2)

        doc3_path = File.join folder, 'no_slash', 'index.txt'
        expect(File.file?(doc3_path)).to be_truthy
        expect(IO.read(doc3_path)).to eq(doc1)
      end
    end
  end
end

RSpec.describe Web2Text::FilePrinter do
  include TestConstruct::Helpers

  it "doesn't choke on roots with no path (eg. http://example.com)" do
    tricky = "http://example.com"
    crawl = Web2Text::Crawl.new tricky
    folder = 'test_output/'

    within_construct do |construct|
      construct.directory 'fileprinter_web2text' do |_dir|
        described_class.new(crawl, folder)
          .append(doc1, "#{tricky}/")

        doc1_path = File.join folder, 'index.txt'
        expect(File.file?(doc1_path)).to be_truthy
        expect(IO.read(doc1_path)).to eq(doc1)
      end
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'rspec'
|
2
|
+
|
3
|
+
require 'web2text'
|
4
|
+
|
5
|
+
# Namespace-free aliases for the classes under test.
Crawl = Web2Text::Crawl
Crawler = Web2Text::Crawler

# Suite-wide RSpec settings.
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
RSpec.configure do |config|
  config.warnings = true
  config.order = :random

  config.expect_with(:rspec) do |expectations|
    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
  end

  config.mock_with(:rspec) do |mocks|
    mocks.verify_partial_doubles = true
  end

  # Seed Ruby's global random number generator from RSpec's seed. Running
  # with the `--seed` value reported by a failing run then reproduces any
  # failure that depends on randomization.
  Kernel.srand config.seed
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'web2text'
|
2
|
+
require 'web2text/version'
|
3
|
+
require 'shellwords'
|
4
|
+
|
5
|
+
ROOT = "http://example.com"

RSpec.describe Web2Text do
  it 'has a semver VERSION' do
    # must start with MAJOR.MINOR.PATCH
    expect(Web2Text::VERSION).to match(/\A\d+\.\d+\.\d+/)
  end
end

RSpec.describe Web2Text, '#parse_cli' do
  # Parses +args+ (one shell-style string) followed by the root URL.
  # Defined on the example group so it does not leak onto Object.
  def parse(args)
    Web2Text.parse_cli "#{args} #{ROOT}".shellsplit
  end

  # Floats are compared by value (`eq`); `be` checks object identity.
  it 'defaults to 0 sleep' do
    expect(parse('')[:sleep]).to eq 0.0
  end

  it 'sleeps for 1s with -s' do
    expect(parse('-s')[:sleep]).to eq 1.0
  end

  it 'can specify sleep with -s N or --sleep N' do
    expect(parse('-s 2')[:sleep]).to eq 2.0
    expect(parse('--sleep 2')[:sleep]).to eq 2.0
  end
end
|
metadata
ADDED
@@ -0,0 +1,143 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: web2text
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Alex Wilson
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-07-16 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: anemone
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0.7'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0.7'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: nokogiri
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.6.6
|
34
|
+
- - ">="
|
35
|
+
- !ruby/object:Gem::Version
|
36
|
+
version: 1.6.6.2
|
37
|
+
type: :runtime
|
38
|
+
prerelease: false
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - "~>"
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 1.6.6
|
44
|
+
- - ">="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: 1.6.6.2
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: rspec
|
49
|
+
requirement: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - "~>"
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '3.3'
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: 3.3.0
|
57
|
+
type: :development
|
58
|
+
prerelease: false
|
59
|
+
version_requirements: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - "~>"
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '3.3'
|
64
|
+
- - ">="
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: 3.3.0
|
67
|
+
- !ruby/object:Gem::Dependency
|
68
|
+
name: rake
|
69
|
+
requirement: !ruby/object:Gem::Requirement
|
70
|
+
requirements:
|
71
|
+
- - "~>"
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
version: '10.4'
|
74
|
+
- - ">="
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: 10.4.2
|
77
|
+
type: :development
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: !ruby/object:Gem::Requirement
|
80
|
+
requirements:
|
81
|
+
- - "~>"
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: '10.4'
|
84
|
+
- - ">="
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: 10.4.2
|
87
|
+
- !ruby/object:Gem::Dependency
|
88
|
+
name: test_construct
|
89
|
+
requirement: !ruby/object:Gem::Requirement
|
90
|
+
requirements:
|
91
|
+
- - "~>"
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '2.0'
|
94
|
+
type: :development
|
95
|
+
prerelease: false
|
96
|
+
version_requirements: !ruby/object:Gem::Requirement
|
97
|
+
requirements:
|
98
|
+
- - "~>"
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: '2.0'
|
101
|
+
description:
|
102
|
+
email:
|
103
|
+
executables:
|
104
|
+
- web2text
|
105
|
+
extensions: []
|
106
|
+
extra_rdoc_files: []
|
107
|
+
files:
|
108
|
+
- bin/web2text
|
109
|
+
- lib/web2text.rb
|
110
|
+
- lib/web2text/crawl.rb
|
111
|
+
- lib/web2text/crawler.rb
|
112
|
+
- lib/web2text/formatters.rb
|
113
|
+
- lib/web2text/version.rb
|
114
|
+
- spec/crawl_spec.rb
|
115
|
+
- spec/crawler_spec.rb
|
116
|
+
- spec/formatters_spec.rb
|
117
|
+
- spec/spec_helper.rb
|
118
|
+
- spec/web2text_spec.rb
|
119
|
+
homepage: https://github.com/yourpalal/web2text
|
120
|
+
licenses:
|
121
|
+
- MIT
|
122
|
+
metadata: {}
|
123
|
+
post_install_message:
|
124
|
+
rdoc_options: []
|
125
|
+
require_paths:
|
126
|
+
- lib
|
127
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - ">="
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
133
|
+
requirements:
|
134
|
+
- - ">="
|
135
|
+
- !ruby/object:Gem::Version
|
136
|
+
version: '0'
|
137
|
+
requirements: []
|
138
|
+
rubyforge_project:
|
139
|
+
rubygems_version: 2.2.2
|
140
|
+
signing_key:
|
141
|
+
specification_version: 4
|
142
|
+
summary: Scrape a website as plain text.
|
143
|
+
test_files: []
|