web2text 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml.gz ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 6e1c4b836214eee9b4901d606ebf4210744e4fae
+   data.tar.gz: c045333d81e7e5e4a5d2fac1d1b39d9d9ab0194e
+ SHA512:
+   metadata.gz: ae8c3350043a9a9213b6ec63429238d582736ecb48b934e0a51329dc69e4fad775c5f5c764628fb6ff5367f9bc8d1ebec4e867e0b12081e1cd98724b100fd56f
+   data.tar.gz: 93853ba2c589dda156578037eaaf76fe9647ba2c8b468ddbe12ce2a995d92b28e3e05b11650efe3f1732b8fe43f5f7f78885aa4a62dffda92f71743f3ac64d11
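
These digests cover the two archives packed inside the .gem tarball (a gem is a plain tar of metadata.gz, data.tar.gz, and this checksums file). A sketch of verifying them by hand, assuming web2text-0.0.1.gem has been downloaded and untarred:

    require 'digest'

    # compare against the SHA1/SHA512 values listed above
    puts Digest::SHA1.file('metadata.gz').hexdigest
    puts Digest::SHA512.file('data.tar.gz').hexdigest
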
bin/web2text ADDED
@@ -0,0 +1,10 @@
+ #!/usr/bin/env ruby
+
+ require 'web2text'
+
+ begin
+   options = Web2Text.parse_cli ARGV
+   Web2Text.do_crawl options
+ rescue Web2Text::Error => e
+   puts "#{e} Try -h for help"
+ end
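
The executable just hands ARGV to Web2Text.parse_cli and passes the resulting options to Web2Text.do_crawl, both defined in lib/web2text.rb below. Illustrative invocations, built only from the options that parser defines:

    web2text http://example.com/
    web2text --query article --sleep 2 --files out/ http://example.com/
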
lib/web2text.rb ADDED
@@ -0,0 +1,123 @@
+ require 'anemone'
+ require 'nokogiri'
+
+ require 'optparse'
+
+
+ module Web2Text
+   class Error < RuntimeError
+     def initialize(msg)
+       super msg
+     end
+   end
+
+   class CommandError < Error
+     def initialize(msg)
+       super msg
+     end
+   end
+
+   def self.parse_cli(args)
+     options = {
+       query: "body",
+       sleep: 0.0,
+       avoid: [],
+       focus: [],
+       formatter: LinePrinter,
+       ignore_robots_txt: false,
+       out: $stdout,
+     }
+
+     args = args.clone
+
+     OptionParser.new do |opts|
+       opts.banner = "Usage: web2text [options] http://example.com/"
+
+       opts.on("-q", "--css", "--query=CSS_QUERY", String) do |q|
+         options[:query] = q
+       end
+
+       opts.on("-s [OPTIONAL]", "--sleep [OPTIONAL]", Float, "Delay in seconds between requests. Default 0; a bare -s sets 1.") do |n|
+         options[:sleep] = n || 1.0
+       end
+
+       opts.on("--avoid x,y,z", Array, "List of paths to avoid when crawling. These paths and everything below them will be ignored.") do |avoid|
+         options[:avoid] = avoid
+       end
+
+       opts.on("--focus x,y,z", Array, "List of paths to process when crawling. Only these paths and pages below them will be processed.") do |focus|
+         options[:focus] = focus
+       end
+
+
+       opts.on("--lines [web2.txt]", String, "One line per page. Prints to stdout or a file.") do |f|
+         options[:formatter] = LinePrinter
+         options[:out] = if f then File.open(f, 'w') else $stdout end
+       end
+
+       opts.on("--files out/", String, "One file per page, following the website structure, in the specified directory.") do |o|
+         options[:formatter] = FilePrinter
+         options[:out] = Pathname(o)
+
+         if options[:out].exist? and !options[:out].directory? then
+           raise Web2Text::CommandError.new 'argument to --files must be a directory'
+         end
+       end
+
+       opts.on("--bad-robot", "Ignore robots.txt") do
+         options[:ignore_robots_txt] = true
+       end
+
+       opts.on_tail("-h", "--help", "Show this message") do
+         puts opts
+         exit
+       end
+     end.parse! args
+
+     if args.length != 1 then
+       raise Web2Text::CommandError.new 'incorrect number of arguments!'
+     end
+
+     options[:url] = args[0]
+     options
+   end
+
+   def self.do_crawl(options)
+     crawl = Crawl.new options[:url], options[:avoid], options[:focus]
+     crawler = Crawler.new crawl, options[:query]
+     formatter = options[:formatter].new crawl, options[:out]
+
+     Anemone.crawl(crawl.url, :obey_robots_txt => !options[:ignore_robots_txt]) do |anemone|
+       anemone.focus_crawl do |page|
+         crawl.filter page.links
+       end
+
+       anemone.on_every_page do |page|
+         STDERR.puts page.url
+
+         # ignore redirects
+         code = page.code || 200
+         if 300 <= code and code < 400
+           next
+         elsif !crawl.focus? page.url
+           next
+         elsif page.doc.nil?
+           STDERR.puts "ERR: Failed to retrieve #{page.url}"
+           next
+         end
+
+         plain = crawler.doc_as_plaintext page.doc
+         formatter.append plain, page.url
+         sleep options[:sleep]
+       end
+
+       anemone.after_crawl do
+         formatter.close
+       end
+     end
+   end
+ end
+
+ require 'web2text/crawl'
+ require 'web2text/crawler'
+ require 'web2text/formatters'
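
Nothing above is CLI-specific beyond the parsing, so the same flow can be driven from Ruby. A minimal sketch (the URL and query are placeholders):

    require 'web2text'

    # build the same options hash the executable builds from ARGV
    options = Web2Text.parse_cli ['--query', 'body', 'http://example.com/']
    Web2Text.do_crawl options
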
lib/web2text/crawl.rb ADDED
@@ -0,0 +1,45 @@
+ require 'uri'
+
+ class Web2Text::Crawl
+   attr_reader :url
+
+   def initialize(url, avoid = [], focus = [])
+     @url = url
+
+     @avoid = avoid.map { |a|
+       a = URI.join(url, a) if !a.start_with? url
+       a.to_s
+     }
+
+     @focus = focus.map { |a|
+       a = URI.join(url, a) if !a.start_with? url
+       a.to_s
+     }
+   end
+
+   def filter(urls)
+     urls.reject { |u| self.skip? u }
+   end
+
+   def skip?(url)
+     url_s = url.to_s
+     if !url_s.start_with? @url
+       return true
+     end
+
+     @avoid.any? { |a|
+       url_s.start_with? a
+     }
+   end
+
+   def focus?(url)
+     if @focus.empty?
+       true
+     else
+       url_s = url.to_s
+       @focus.any? { |a|
+         url_s.start_with? a
+       }
+     end
+   end
+ end
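
The prefix matching above is easiest to read with concrete URLs; a sketch matching the behaviour exercised in spec/crawl_spec.rb:

    require 'web2text'

    crawl = Web2Text::Crawl.new 'http://example.com', ['/avoid']
    crawl.skip? 'http://example.com/avoid/page.html'  # => true, under an avoided path
    crawl.skip? 'http://elsewhere.example.org/'       # => true, outside the crawl root
    crawl.skip? 'http://example.com/keep.html'        # => false
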
lib/web2text/crawler.rb ADDED
@@ -0,0 +1,19 @@
+
+ class Web2Text::Crawler
+   def initialize(crawl, query = "body")
+     @crawl = crawl
+     @query = query
+   end
+
+   def doc_as_plaintext(doc)
+     # just using inner_text doesn't give us quite enough spaces :(
+     doc.css(@query).collect do |j|
+       bits = []
+       j.traverse do |c|
+         if c.text? then bits.push c.content end
+       end
+
+       bits.join(' ')
+     end.join(' ')
+   end
+ end
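
Since doc_as_plaintext needs only a Nokogiri document, it can be tried in isolation; a sketch along the lines of spec/crawler_spec.rb:

    require 'nokogiri'
    require 'web2text'

    doc = Nokogiri::HTML '<html><body><h1>Title</h1><p>Body text</p></body></html>'
    crawl = Web2Text::Crawl.new 'http://example.com'
    Web2Text::Crawler.new(crawl, 'p').doc_as_plaintext doc  # => "Body text"
    Web2Text::Crawler.new(crawl).doc_as_plaintext doc       # => "Title Body text"
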
lib/web2text/formatters.rb ADDED
@@ -0,0 +1,50 @@
+ require 'uri'
+
+ class Web2Text::LinePrinter
+   def initialize(crawl, output)
+     @output = output
+     @first = true
+   end
+
+   def append(doc, uri)
+     if !@first then
+       @output.write "\n"
+     end
+     @first = false
+
+     @output.write doc.gsub(/\n+/, ' ')
+     self
+   end
+
+   def close
+     @output.close
+   end
+ end
+
+ # Writes one file per page
+ class Web2Text::FilePrinter
+   def initialize(crawl, out_dir)
+     root_path = URI(crawl.url).path.to_s
+     root_path = "/" if root_path.empty?
+
+     @crawl_root = Pathname(root_path)
+     @out_dir = Pathname(out_dir)
+     @out_dir.mkpath
+   end
+
+   def append(doc, uri)
+     path = @out_dir + Pathname(URI(uri).path).relative_path_from(@crawl_root)
+     if path.extname == "" then
+       path = path + 'index.txt'
+     end
+
+     path = path.sub_ext('.txt')
+
+     path.parent.mkpath
+     path.open("w") { |f| f.write(doc) }
+     self
+   end
+
+   def close
+   end
+ end
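
Both printers implement the append/close protocol that do_crawl drives. A sketch of LinePrinter against an in-memory IO, much as spec/formatters_spec.rb does:

    require 'stringio'
    require 'web2text'

    crawl = Web2Text::Crawl.new 'http://example.com/'
    out = StringIO.new
    printer = Web2Text::LinePrinter.new crawl, out
    printer.append "first page\nof text", 'http://example.com/'  # embedded newlines collapse to spaces
    printer.append 'second page', 'http://example.com/two.html'
    out.string  # => "first page of text\nsecond page"
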
lib/web2text/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Web2Text
+   VERSION = "0.0.1"
+ end
spec/crawl_spec.rb ADDED
@@ -0,0 +1,45 @@
+ require 'web2text'
+
+ root = "http://example.com"
+
+ RSpec.describe Crawl, '#filter' do
+   context "with no patterns" do
+     it "returns all links" do
+       crawl = Crawl.new root
+       links = ["#{root}/wow", "#{root}/neat"]
+       expect(crawl.filter links).to eq links
+       expect(links.select { |u| crawl.focus? u }).to eq links
+     end
+
+     it "will not crawl above the root" do
+       crawl = Crawl.new "#{root}/wow/cool"
+       expect(crawl.skip? root).to be true
+     end
+   end
+
+   context "with patterns" do
+     it "can filter out whole directories" do
+       good = ["#{root}/wow", "#{root}/neat"]
+       bad = ["#{root}/avoid", "#{root}/avoid/index.html", "#{root}/avoid/this/nested/stuff"]
+
+       crawl = Crawl.new root, ["#{root}/avoid"]
+       expect(crawl.filter good + bad).to eq good
+     end
+
+     it "can focus on pages" do
+       bad = ["#{root}/avoid", "#{root}/avoid"]
+       good = ["#{root}/focus", "#{root}/focus/index.html", "#{root}/focus/this/nested/stuff"]
+
+       crawl = Crawl.new root, [], ["#{root}/focus"]
+       expect((good + bad).select { |u| crawl.focus? u }).to eq good
+     end
+
+     it "can take avoid paths relative to the root to filter out directories" do
+       good = ["#{root}/wow", "#{root}/neat"]
+       bad = ["#{root}/avoid", "#{root}/avoid/index.html", "#{root}/avoid/this/nested/stuff"]
+
+       crawl = Crawl.new root, ["/avoid"]
+       expect(crawl.filter good + bad).to eq good
+     end
+   end
+ end
spec/crawler_spec.rb ADDED
@@ -0,0 +1,39 @@
+ require 'nokogiri'
+
+ require 'web2text'
+
+ RSpec.describe Crawler, '#doc_as_plaintext' do
+   before(:all) do
+     @root = "http://example.com"
+     @crawl = Crawl.new @root
+
+     @h1_content = "This is a document"
+     @p_content = "good stuff!"
+
+     @example_html = Nokogiri::HTML "<!doctype html><html><head></head><body><h1>#{@h1_content}</h1><p>#{@p_content}</p></body></html>"
+   end
+
+   before(:each) do
+     @crawler = Crawler.new @crawl
+   end
+
+   it 'can consider a page and make output' do
+     out = @crawler.doc_as_plaintext @example_html
+     expect(out).to eq "#{@h1_content} #{@p_content}"
+   end
+
+   it 'can limit the output by using css queries' do
+     tests = [
+       ["p", @p_content],
+       ["h1", @h1_content],
+       ["p, h1", "#{@h1_content} #{@p_content}"],
+       ["h1, p", "#{@h1_content} #{@p_content}"]
+     ]
+
+     tests.each do |test|
+       @crawler = Crawler.new @crawl, test[0]
+       out = @crawler.doc_as_plaintext @example_html
+       expect(out).to eq(test[1]), "with css query '#{test[0]}', got '#{out}', but expected '#{test[1]}'"
+     end
+   end
+ end
spec/formatters_spec.rb ADDED
@@ -0,0 +1,81 @@
+ require 'rspec'
+ require 'stringio'
+ require 'test_construct'
+
+ require 'web2text'
+
+ LinePrinter = Web2Text::LinePrinter
+ FilePrinter = Web2Text::FilePrinter
+
+ doc1 = "This is a document\nwith a newline"
+ doc2 = "This is another document"
+ root = 'http://example.com/wow/'
+
+ RSpec.describe LinePrinter, '#append' do
+   it 'prints one line per document' do
+     crawl = Crawl.new root
+     result = StringIO.open do |out|
+       LinePrinter.new(crawl, out)
+         .append(doc1, "#{root}index.html")
+         .append(doc2, "#{root}/cool/index.html")
+
+       out.string
+     end
+
+     expect(result.lines.length).to eq(2)
+     expect(result.lines[1]).to eq(doc2)
+   end
+ end
+
+
+ RSpec.describe FilePrinter, '#append' do
+   include TestConstruct::Helpers
+
+   it 'prints one file per document' do
+     crawl = Crawl.new root
+     folder = 'test_output/'
+
+     within_construct do |construct|
+       construct.directory 'fileprinter_web2text' do |d|
+         FilePrinter.new(crawl, folder)
+           .append(doc1, "#{root}/")
+           .append(doc2, "#{root}/cool/index.html")
+           .append(doc1, "#{root}/no_slash")
+
+         doc1_path = File.join folder, 'index.txt'
+         expect(File.file?(doc1_path)).to be_truthy
+         expect(IO.read(doc1_path)).to eq(doc1)
+
+         doc2_path = File.join(folder, 'cool', 'index.txt')
+         expect(File.file?(doc2_path)).to be_truthy
+         expect(IO.read(doc2_path)).to eq(doc2)
+
+         doc3_path = File.join folder, 'no_slash', 'index.txt'
+         expect(File.file?(doc3_path)).to be_truthy
+         expect(IO.read(doc3_path)).to eq(doc1)
+
+       end
+     end
+   end
+ end
+
+ RSpec.describe FilePrinter do
+   include TestConstruct::Helpers
+
+   it "doesn't choke on roots with no path (eg. http://example.com)" do
+     tricky = "http://example.com"
+     crawl = Crawl.new tricky
+     folder = 'test_output/'
+
+     within_construct do |construct|
+       construct.directory 'fileprinter_web2text' do |d|
+         FilePrinter.new(crawl, folder)
+           .append(doc1, "#{tricky}/")
+
+         doc1_path = File.join folder, 'index.txt'
+         expect(File.file?(doc1_path)).to be_truthy
+         expect(IO.read(doc1_path)).to eq(doc1)
+       end
+     end
+   end
+ end
spec/spec_helper.rb ADDED
@@ -0,0 +1,27 @@
+ require 'rspec'
+
+ require 'web2text'
+
+ Crawler = Web2Text::Crawler
+ Crawl = Web2Text::Crawl
+
+
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+ RSpec.configure do |config|
+   config.expect_with :rspec do |expectations|
+     expectations.include_chain_clauses_in_custom_matcher_descriptions = true
+   end
+
+   config.mock_with :rspec do |mocks|
+     mocks.verify_partial_doubles = true
+   end
+
+   config.warnings = true
+   config.order = :random
+
+   # Seed global randomization in this process using the `--seed` CLI option.
+   # Setting this allows you to use `--seed` to deterministically reproduce
+   # test failures related to randomization by passing the same `--seed` value
+   # as the one that triggered the failure.
+   Kernel.srand config.seed
+ end
spec/web2text_spec.rb ADDED
@@ -0,0 +1,30 @@
+ require 'web2text'
+ require 'web2text/version'
+ require 'shellwords'
+
+ ROOT = "http://example.com"
+
+ def parse(args)
+   Web2Text.parse_cli "#{args} #{ROOT}".shellsplit
+ end
+
+ RSpec.describe Web2Text do
+   it 'has a semver VERSION' do
+     expect(Web2Text::VERSION =~ /\d+\.\d+\.\d+/).to be 0
+   end
+ end
+
+ RSpec.describe Web2Text, '#parse_cli' do
+   it 'defaults to 0 sleep' do
+     expect(parse('')[:sleep]).to be 0.0
+   end
+
+   it 'sleeps for 1s with -s' do
+     expect(parse('-s')[:sleep]).to be 1.0
+   end
+
+   it 'can specify sleep with -s N or --sleep N' do
+     expect(parse('-s 2')[:sleep]).to be 2.0
+     expect(parse('--sleep 2')[:sleep]).to be 2.0
+   end
+ end
metadata ADDED
@@ -0,0 +1,143 @@
+ --- !ruby/object:Gem::Specification
+ name: web2text
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Alex Wilson
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2015-07-16 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: anemone
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.7'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.7'
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 1.6.6
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.6.6.2
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 1.6.6
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.6.6.2
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.3'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 3.3.0
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.3'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 3.3.0
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.4'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 10.4.2
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.4'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 10.4.2
+ - !ruby/object:Gem::Dependency
+   name: test_construct
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.0'
+ description:
+ email:
+ executables:
+ - web2text
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - bin/web2text
+ - lib/web2text.rb
+ - lib/web2text/crawl.rb
+ - lib/web2text/crawler.rb
+ - lib/web2text/formatters.rb
+ - lib/web2text/version.rb
+ - spec/crawl_spec.rb
+ - spec/crawler_spec.rb
+ - spec/formatters_spec.rb
+ - spec/spec_helper.rb
+ - spec/web2text_spec.rb
+ homepage: https://github.com/yourpalal/web2text
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.2.2
+ signing_key:
+ specification_version: 4
+ summary: Scrape a website as plain text.
+ test_files: []
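
The specification above resolves to an ordinary RubyGems install, pulling anemone (~> 0.7) and nokogiri (~> 1.6.6, >= 1.6.6.2) as runtime dependencies:

    gem install web2text -v 0.0.1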