web2text 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 6e1c4b836214eee9b4901d606ebf4210744e4fae
4
+ data.tar.gz: c045333d81e7e5e4a5d2fac1d1b39d9d9ab0194e
5
+ SHA512:
6
+ metadata.gz: ae8c3350043a9a9213b6ec63429238d582736ecb48b934e0a51329dc69e4fad775c5f5c764628fb6ff5367f9bc8d1ebec4e867e0b12081e1cd98724b100fd56f
7
+ data.tar.gz: 93853ba2c589dda156578037eaaf76fe9647ba2c8b468ddbe12ce2a995d92b28e3e05b11650efe3f1732b8fe43f5f7f78885aa4a62dffda92f71743f3ac64d11
@@ -0,0 +1,10 @@
1
#!/usr/bin/env ruby
# Command-line entry point: parse ARGV and run the crawl.

require 'web2text'

begin
  options = Web2Text.parse_cli ARGV
  Web2Text.do_crawl options
rescue Web2Text::Error => e
  # Usage errors belong on stderr, and the shell should see a failure
  # status (the original printed to stdout and exited 0).
  warn "#{e.message} Try -h for help"
  exit 1
end
@@ -0,0 +1,123 @@
1
+ require 'anemone'
2
+ require 'nokogiri'
3
+
4
+ require 'optparse'
5
+
6
+
7
module Web2Text
  # Base class for all errors raised by this gem; callers rescue this
  # (see bin/web2text).
  class Error < RuntimeError
  end

  # Raised for invalid command-line usage (bad flags, wrong argument count).
  class CommandError < Error
  end

  # Parses command-line arguments into an options hash.
  #
  # args - an Array of Strings (e.g. ARGV). Not mutated; a copy is parsed.
  #
  # Returns a Hash with keys :url, :query, :sleep, :avoid, :focus,
  # :formatter, :ignore_robots_txt and :out.
  # Raises CommandError unless exactly one positional URL is given.
  # Exits the process after printing usage when -h/--help is passed.
  def self.parse_cli(args)
    options = {
      query: "body",
      sleep: 0.0,
      avoid: [],
      focus: [],
      formatter: LinePrinter,
      ignore_robots_txt: false,
      out: $stdout,
    }

    # OptionParser#parse! mutates its argument, so work on a copy.
    args = args.clone

    OptionParser.new do |opts|
      opts.banner = "Usage: web2text [options] http://example.com/"

      opts.on("-q", "--css", "--query=CSS_QUERY", String) do |q|
        options[:query] = q
      end

      opts.on("-s [OPTIONAL]", "--sleep [OPTIONAL]", Float, "Delay between requests. Default 1, -s sets to 1.") do |n|
        # Bare -s (no value) means a 1 second delay.
        options[:sleep] = n || 1.0
      end

      opts.on("--avoid x,y,z", Array, "List of paths to avoid when crawling. These paths and everything below them will be ignored.") do |avoid|
        options[:avoid] = avoid
      end

      opts.on("--focus x,y,z", Array, "List of paths to process when crawling. Only these paths and pages below them will be processed") do |focus|
        options[:focus] = focus
      end

      opts.on("--lines [web2.txt]", String, "One line per page. Can print to std out or a file.") do |f|
        options[:formatter] = LinePrinter
        options[:out] = f ? File.open(f, 'w') : $stdout
      end

      opts.on("--files out/", String, "One file per page. Following website structure, in the specified directory.") do |o|
        options[:formatter] = FilePrinter
        options[:out] = Pathname(o)

        if options[:out].exist? && !options[:out].directory?
          raise CommandError, 'argument to --files must be a directory'
        end
      end

      opts.on("--bad-robot", "Ignore robots.txt") do
        options[:ignore_robots_txt] = true
      end

      opts.on_tail("-h", "--help", "Show this message") do
        puts opts
        exit
      end
    end.parse! args

    # Exactly one positional argument (the root URL) is required.
    raise CommandError, 'incorrect number of arguments!' if args.length != 1

    options[:url] = args[0]
    options
  end

  # Crawls options[:url] with Anemone, extracting plain text (via the CSS
  # query) from every page that passes the avoid/focus filters, feeding
  # each page's text to the configured formatter.
  def self.do_crawl(options)
    crawl = Crawl.new options[:url], options[:avoid], options[:focus]
    crawler = Crawler.new crawl, options[:query]
    formatter = options[:formatter].new crawl, options[:out]

    Anemone.crawl(crawl.url, :obey_robots_txt => !options[:ignore_robots_txt]) do |anemone|
      anemone.focus_crawl do |page|
        crawl.filter page.links
      end

      anemone.on_every_page do |page|
        STDERR.puts page.url

        # Anemone may report no status code; assume success.
        code = page.code || 200
        if (300...400).cover?(code)
          # ignore redirects
          next
        elsif !crawl.focus? page.url
          next
        elsif page.doc.nil?
          STDERR.puts "ERR: Failed to retrieve #{page.url}"
          next
        end

        plain = crawler.doc_as_plaintext page.doc
        formatter.append plain, page.url
        sleep options[:sleep]
      end

      anemone.after_crawl do
        formatter.close
      end
    end
  end
end
120
+
121
+ require 'web2text/crawl'
122
+ require 'web2text/crawler'
123
+ require 'web2text/formatters'
@@ -0,0 +1,45 @@
1
+ require 'uri'
2
+
3
module Web2Text
  # Holds the crawl configuration: a root URL plus optional lists of URL
  # prefixes to avoid or to focus on. Relative paths ("/avoid") are
  # resolved against the root URL.
  class Crawl
    attr_reader :url

    # url   - root URL (String); crawling never leaves this prefix.
    # avoid - paths/URLs whose subtrees should not be crawled.
    # focus - paths/URLs; when non-empty, only their subtrees are processed.
    def initialize(url, avoid = [], focus = [])
      @url = url
      @avoid = absolutize(avoid)
      @focus = absolutize(focus)
    end

    # Returns the subset of urls worth crawling (everything not skipped).
    def filter(urls)
      urls.reject { |u| skip? u }
    end

    # True when url lies outside the root, or under an avoided prefix.
    def skip?(url)
      url_s = url.to_s
      return true unless url_s.start_with? @url

      @avoid.any? { |a| url_s.start_with? a }
    end

    # True when url should be processed. Without focus paths, every url
    # is in focus.
    def focus?(url)
      return true if @focus.empty?

      url_s = url.to_s
      @focus.any? { |a| url_s.start_with? a }
    end

    private

    # Resolves each entry against the root URL unless it already starts
    # with it, returning absolute URL strings. (Previously this mapping
    # was duplicated for @avoid and @focus.)
    def absolutize(paths)
      paths.map do |p|
        p = URI.join(@url, p) unless p.start_with? @url
        p.to_s
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
+
2
module Web2Text
  # Turns parsed HTML documents into plain text.
  class Crawler
    # crawl - the Web2Text::Crawl this crawler belongs to.
    # query - CSS selector choosing which elements to extract text from.
    def initialize(crawl, query = "body")
      @crawl = crawl
      @query = query
    end

    # Returns the text of every node matched by the CSS query, with the
    # individual text nodes joined by single spaces.
    # (Plain inner_text would run words from adjacent elements together.)
    def doc_as_plaintext(doc)
      doc.css(@query).map do |node|
        pieces = []
        node.traverse do |child|
          pieces << child.content if child.text?
        end
        pieces.join(' ')
      end.join(' ')
    end
  end
end
@@ -0,0 +1,50 @@
1
require 'pathname'
require 'uri'
2
+
3
module Web2Text
  # Writes one line of plain text per page to a single IO stream.
  class LinePrinter
    # crawl  - unused; kept for interface parity with FilePrinter.
    # output - an IO to write lines to ($stdout or an open File).
    def initialize(crawl, output)
      @output = output
      @first = true
    end

    # Writes doc as a single line, collapsing internal newline runs to
    # spaces. uri is unused; kept for interface parity with FilePrinter.
    # Returns self so calls can be chained.
    def append(doc, uri)
      @output.write "\n" unless @first
      @first = false

      @output.write doc.gsub(/\n+/, ' ')
      self
    end

    # Closes the underlying stream — but never the process-wide $stdout
    # (the original closed it unconditionally, which breaks any later
    # output in the default CLI configuration).
    def close
      @output.close unless @output.equal?($stdout)
    end
  end
end
23
+
24
module Web2Text
  # Writes one file per page, mirroring the website's path structure
  # under an output directory. Every file gets a .txt extension;
  # extensionless page paths become <page>/index.txt.
  class FilePrinter
    # crawl   - supplies the root URL whose path output files are made
    #           relative to.
    # out_dir - directory (String or Pathname) to write into; created
    #           if missing.
    def initialize(crawl, out_dir)
      root_path = URI(crawl.url).path.to_s
      # A URL like http://example.com has an empty path; treat it as "/".
      root_path = "/" if root_path.empty?

      @crawl_root = Pathname(root_path)
      @out_dir = Pathname(out_dir)
      @out_dir.mkpath
    end

    # Writes doc to a .txt file derived from uri's path.
    # Returns self so calls can be chained.
    def append(doc, uri)
      path = @out_dir + Pathname(URI(uri).path).relative_path_from(@crawl_root)
      # Paths without an extension are treated as directories.
      path += 'index.txt' if path.extname == ""

      path = path.sub_ext('.txt')

      path.parent.mkpath
      path.open("w") { |f| f.write(doc) }
      self
    end

    # Nothing to release: each append opens and closes its own file.
    def close
    end
  end
end
@@ -0,0 +1,3 @@
1
module Web2Text
  # Gem version, following semantic versioning. Frozen so the constant
  # cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,45 @@
1
require 'web2text'

root = "http://example.com"

RSpec.describe Crawl, '#filter' do
  context "with no patterns" do
    it "returns all links" do
      crawl = Crawl.new root
      links = ["#{root}/wow", "#{root}/neat"]
      expect(crawl.filter links).to eq links
      expect(links.select { |u| crawl.focus? u }).to eq links
    end

    it "will not crawl above the root" do
      crawl = Crawl.new "#{root}/wow/cool"
      expect(crawl.skip? root).to be true
    end
  end

  context "with patterns" do
    it "can filter out whole directories" do
      kept = ["#{root}/wow", "#{root}/neat"]
      dropped = ["#{root}/avoid", "#{root}/avoid/index.html", "#{root}/avoid/this/nested/stuff"]

      crawl = Crawl.new root, ["#{root}/avoid"]
      expect(crawl.filter kept + dropped).to eq kept
    end

    it "can focus on pages" do
      dropped = ["#{root}/avoid", "#{root}/avoid"]
      kept = ["#{root}/focus", "#{root}/focus/index.html", "#{root}/focus/this/nested/stuff"]

      crawl = Crawl.new root, [], ["#{root}/focus"]
      expect((kept + dropped).select { |u| crawl.focus? u }).to eq kept
    end

    it "can skip host name parts to filter out directories" do
      kept = ["#{root}/wow", "#{root}/neat"]
      dropped = ["#{root}/avoid", "#{root}/avoid/index.html", "#{root}/avoid/this/nested/stuff"]

      # Avoid entries may be bare paths; they resolve against the root.
      crawl = Crawl.new root, ["/avoid"]
      expect(crawl.filter kept + dropped).to eq kept
    end
  end
end
@@ -0,0 +1,39 @@
1
require 'nokogiri'

require 'web2text'

RSpec.describe Crawler, '#process_doc' do
  before(:all) do
    @root = "http://example.com"
    @crawl = Crawl.new @root

    @h1_content = "This is a document"
    @p_content = "good stuff!"

    @example_html = Nokogiri::HTML "<!doctype html><html><head></head><body><h1>#{@h1_content}</h1><p>#{@p_content}</p></body></html>"
  end

  before(:each) do
    @crawler = Crawler.new @crawl
  end

  it 'can consider a page and make output' do
    expect(@crawler.doc_as_plaintext(@example_html)).to eq "#{@h1_content} #{@p_content}"
  end

  it 'can limit the output by using css queries' do
    # Each pair is [css query, expected plain text].
    [
      ["p", @p_content],
      ["h1", @h1_content],
      ["p, h1", "#{@h1_content} #{@p_content}"],
      ["h1, p", "#{@h1_content} #{@p_content}"]
    ].each do |query, expected|
      @crawler = Crawler.new @crawl, query
      out = @crawler.doc_as_plaintext @example_html
      expect(out).to eq(expected), "with css query '#{query}', got '#{out}', but expected '#{expected}'"
    end
  end
end
@@ -0,0 +1,81 @@
1
require 'rspec'
require 'stringio'
require 'test_construct'

require 'web2text'

LinePrinter = Web2Text::LinePrinter
FilePrinter = Web2Text::FilePrinter

doc1 = "This is a document\nwith a newline"
doc2 = "This is another document"
root = 'http://example.com/wow/'

RSpec.describe LinePrinter, '#append' do
  it 'prints one line per document' do
    crawl = Crawl.new root
    result = StringIO.open do |out|
      printer = LinePrinter.new(crawl, out)
      printer.append(doc1, "#{root}index.html")
      printer.append(doc2, "#{root}/cool/index.html")

      out.string
    end

    expect(result.lines.length).to eq(2)
    expect(result.lines[1]).to eq(doc2)
  end
end


RSpec.describe FilePrinter, '#append' do
  include TestConstruct::Helpers

  it 'prints one file per document' do
    crawl = Crawl.new root
    folder = 'test_output/'

    within_construct do |construct|
      construct.directory 'fileprinter_web2text' do |d|
        printer = FilePrinter.new(crawl, folder)
        printer.append(doc1, "#{root}/")
        printer.append(doc2, "#{root}/cool/index.html")
        printer.append(doc1, "#{root}/no_slash")

        # Each expected file path mapped to the document it should hold.
        {
          File.join(folder, 'index.txt') => doc1,
          File.join(folder, 'cool', 'index.txt') => doc2,
          File.join(folder, 'no_slash', 'index.txt') => doc1,
        }.each do |path, contents|
          expect(File.file?(path)).to be_truthy
          expect(IO.read(path)).to eq(contents)
        end
      end
    end
  end
end

RSpec.describe FilePrinter do
  include TestConstruct::Helpers

  it "doesn't choke on roots with no path (eg. http://example.com)" do
    tricky = "http://example.com"
    crawl = Crawl.new tricky
    folder = 'test_output/'

    within_construct do |construct|
      construct.directory 'fileprinter_web2text' do |d|
        FilePrinter.new(crawl, folder).append(doc1, "#{tricky}/")

        doc1_path = File.join folder, 'index.txt'
        expect(File.file?(doc1_path)).to be_truthy
        expect(IO.read(doc1_path)).to eq(doc1)
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
require 'rspec'

require 'web2text'

# Convenience aliases so specs can refer to the classes without the
# Web2Text:: prefix.
Crawler = Web2Text::Crawler
Crawl = Web2Text::Crawl


# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
RSpec.configure do |config|
  config.expect_with :rspec do |expectations|
    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
  end

  config.mock_with :rspec do |mocks|
    mocks.verify_partial_doubles = true
  end

  config.warnings = true
  config.order = :random

  # Seed global randomization in this process using the `--seed` CLI option.
  # Setting this allows you to use `--seed` to deterministically reproduce
  # test failures related to randomization by passing the same `--seed` value
  # as the one that triggered the failure.
  Kernel.srand config.seed
end
@@ -0,0 +1,30 @@
1
require 'web2text'
require 'web2text/version'
require 'shellwords'

ROOT = "http://example.com"

# Runs the CLI parser on the given flag string plus the mandatory
# positional root URL.
def parse(args)
  Web2Text.parse_cli "#{args} #{ROOT}".shellsplit
end

RSpec.describe Web2Text do
  it 'has a semver VERSION' do
    expect(Web2Text::VERSION =~ /\d+\.\d+\.\d+/).to be 0
  end
end

RSpec.describe Web2Text, '#parse_cli' do
  it 'defaults to 0 sleep' do
    expect(parse('')[:sleep]).to be 0.0
  end

  it 'sleeps for 1s with -s' do
    expect(parse('-s')[:sleep]).to be 1.0
  end

  it 'can specify sleep with -s N or --sleep N' do
    expect(parse('-s 2')[:sleep]).to be 2.0
    expect(parse('--sleep 2')[:sleep]).to be 2.0
  end
end
metadata ADDED
@@ -0,0 +1,143 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: web2text
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Alex Wilson
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-07-16 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: anemone
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.7'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 1.6.6
34
+ - - ">="
35
+ - !ruby/object:Gem::Version
36
+ version: 1.6.6.2
37
+ type: :runtime
38
+ prerelease: false
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - "~>"
42
+ - !ruby/object:Gem::Version
43
+ version: 1.6.6
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: 1.6.6.2
47
+ - !ruby/object:Gem::Dependency
48
+ name: rspec
49
+ requirement: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '3.3'
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: 3.3.0
57
+ type: :development
58
+ prerelease: false
59
+ version_requirements: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - "~>"
62
+ - !ruby/object:Gem::Version
63
+ version: '3.3'
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: 3.3.0
67
+ - !ruby/object:Gem::Dependency
68
+ name: rake
69
+ requirement: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - "~>"
72
+ - !ruby/object:Gem::Version
73
+ version: '10.4'
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ version: 10.4.2
77
+ type: :development
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - "~>"
82
+ - !ruby/object:Gem::Version
83
+ version: '10.4'
84
+ - - ">="
85
+ - !ruby/object:Gem::Version
86
+ version: 10.4.2
87
+ - !ruby/object:Gem::Dependency
88
+ name: test_construct
89
+ requirement: !ruby/object:Gem::Requirement
90
+ requirements:
91
+ - - "~>"
92
+ - !ruby/object:Gem::Version
93
+ version: '2.0'
94
+ type: :development
95
+ prerelease: false
96
+ version_requirements: !ruby/object:Gem::Requirement
97
+ requirements:
98
+ - - "~>"
99
+ - !ruby/object:Gem::Version
100
+ version: '2.0'
101
+ description:
102
+ email:
103
+ executables:
104
+ - web2text
105
+ extensions: []
106
+ extra_rdoc_files: []
107
+ files:
108
+ - bin/web2text
109
+ - lib/web2text.rb
110
+ - lib/web2text/crawl.rb
111
+ - lib/web2text/crawler.rb
112
+ - lib/web2text/formatters.rb
113
+ - lib/web2text/version.rb
114
+ - spec/crawl_spec.rb
115
+ - spec/crawler_spec.rb
116
+ - spec/formatters_spec.rb
117
+ - spec/spec_helper.rb
118
+ - spec/web2text_spec.rb
119
+ homepage: https://github.com/yourpalal/web2text
120
+ licenses:
121
+ - MIT
122
+ metadata: {}
123
+ post_install_message:
124
+ rdoc_options: []
125
+ require_paths:
126
+ - lib
127
+ required_ruby_version: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ required_rubygems_version: !ruby/object:Gem::Requirement
133
+ requirements:
134
+ - - ">="
135
+ - !ruby/object:Gem::Version
136
+ version: '0'
137
+ requirements: []
138
+ rubyforge_project:
139
+ rubygems_version: 2.2.2
140
+ signing_key:
141
+ specification_version: 4
142
+ summary: Scrape a website as plain text.
143
+ test_files: []