rubyretriever 1.2.4 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: a5749cf55198f97bab6c77297bf6409a2518bca0
- data.tar.gz: df792b6d3b1d03a8b70faadf651e20779f4fd1e8
+ metadata.gz: 05f8e6c0169af87c8284c8b6e98d5f25488b0980
+ data.tar.gz: a45a361b215b5ae7832e762b08bbdb989d0847a1
  SHA512:
- metadata.gz: 76e12598873e8779e196f84ec040a5c9fef01e410c884fa88b18f41335848a69dea778459bbb9629f941dbe12c8ab2c57032d24007a1f4fd57c5caec3a76abea
- data.tar.gz: 13b91c96ce17eb8ce802250505943d13efbd292cd95b120336ddd0ddd98278c513310c68eb9606a92b4018ef6ec9369369ee5869eff0abb589457c7b3bde41c3
+ metadata.gz: 8cee32f96e0ea0fe003a109016c6b17f9ddde9a73d72dbfd0a95c63e413b87b41c2ccf5bbc86b9886f78f59f053fbcd27aad0cbbbededbe8b002f0f7d986c528
+ data.tar.gz: fd0762069a69f7383a59b4058b46bde46793defec437eb1525e69f98f60e4429c1dc676d2d8ae3fa828c8d82d2018a9e9685bb15537424cb3e15fe0d5c472ade
data/bin/rr CHANGED
@@ -32,7 +32,7 @@ optparse = OptionParser.new do |opts|
  opts.on('-p', '--progress', 'Output progress bar') do
  options['progress'] = true
  end
- options['maxpages'] = false
+ options['maxpages'] = 100
  opts.on('-l',
  '--limit PAGE_LIMIT_#',
  'set a max on the total number of crawled pages') do |maxp|
@@ -0,0 +1,13 @@
+ require 'htmlentities'
+ #
+ module SourceString
+ refine String do
+ def decode_html
+ HTMLEntities.new.decode(self)
+ end
+
+ def encode_utf8_and_replace
+ encode('UTF-8', invalid: :replace, undef: :replace)
+ end
+ end
+ end
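This new file (presumably the lib/retriever/core_ext.rb listed in the metadata section below) is a String refinement, so its helpers only apply in files that opt in with `using SourceString`. A minimal usage sketch, with a made-up entity-laden string and a hypothetical local file:

```ruby
require 'retriever' # retriever.rb now requires 'retriever/core_ext'

# Refinements are lexically scoped: decode_html and encode_utf8_and_replace
# are only visible in scopes that opt in with `using`.
using SourceString

puts 'Fish &amp; Chips &copy; 2014'.decode_html
# => "Fish & Chips © 2014"

# Scrub byte sequences that are not valid UTF-8 instead of raising an
# Encoding error. 'page.html' is a placeholder file, not part of the gem.
body = File.binread('page.html')
puts body.encode_utf8_and_replace.valid_encoding? # => true
```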
@@ -15,6 +15,7 @@ module Retriever
  # There is no direct output
  # this is a parent class that the other fetch classes build off of.
  def initialize(url, options)
+ @iterator = false
  @result = []
  @connection_tally = {
  success: 0,
@@ -27,6 +28,9 @@ module Retriever
  @t = Retriever::Target.new(url, @file_re)
  @output = "rr-#{@t.host.split('.')[1]}" if @fileharvest && !@output
  @already_crawled = setup_bloom_filter
+ end
+
+ def start
  @page_one = crawl_page_one
  @link_stack = create_link_stack
  @temp_link_stack = []
@@ -80,6 +84,39 @@ module Retriever
  puts
  end

+ # returns true is resp is ok to continue
+ def good_response?(resp, url)
+ return false unless resp
+ hdr = resp.response_header
+ if hdr.redirection?
+ loc = hdr.location
+ lg("#{url} Redirected to #{loc}")
+ if t.host_re =~ loc
+ @temp_link_stack.push(loc) unless @already_crawled.include?(loc)
+ lg('--Added to stack for later')
+ return false
+ end
+ lg("Redirection outside of target host. No - go. #{loc}")
+ return false
+ end
+ # lets not continue if unsuccessful connection
+ unless hdr.successful?
+ lg("UNSUCCESSFUL CONNECTION -- #{url}")
+ @connection_tally[:error] += 1
+ @connection_tally[:error_server] += 1 if hdr.server_error?
+ @connection_tally[:error_client] += 1 if hdr.client_error?
+ return false
+ end
+ # let's not continue if not text/html
+ unless hdr['CONTENT_TYPE'] =~ %r{(text/html|application/xhtml+xml)}
+ @already_crawled.insert(url)
+ lg("Page Not text/html -- #{url}")
+ return false
+ end
+ @connection_tally[:success] += 1
+ true
+ end
+
  private

  def setup_options(options)
@@ -119,7 +156,7 @@ module Retriever
  end

  def crawl_page_one
- page_one = Retriever::Page.new(@t.source, @t)
+ page_one = Retriever::Page.new(@t.target, @t.source, @t)
  lg("URL Crawled: #{@t.target}")
  page_one
  end
@@ -141,13 +178,13 @@ module Retriever

  # iterates over the existing @link_stack
  # running until we reach the @max_pages value.
- def async_crawl_and_collect
+ def async_crawl_and_collect(&block)
  while @already_crawled.size < @max_pages
  if @link_stack.empty?
  end_crawl_notice
  break
  end
- new_links_arr = process_link_stack
+ new_links_arr = process_link_stack(&block)
  @temp_link_stack = []
  next if new_links_arr.nil? || new_links_arr.empty?
  @link_stack.concat(new_links_arr)
@@ -157,47 +194,14 @@ module Retriever
  @result.uniq!
  end

- # returns true is resp is ok to continue
- def good_response?(resp, url)
- return false unless resp
- hdr = resp.response_header
- if hdr.redirection?
- loc = hdr.location
- lg("#{url} Redirected to #{loc}")
- if t.host_re =~ loc
- @temp_link_stack.push(loc) unless @already_crawled.include?(loc)
- lg('--Added to stack for later')
- return false
- end
- lg("Redirection outside of target host. No - go. #{loc}")
- return false
- end
- # lets not continue if unsuccessful connection
- unless hdr.successful?
- lg("UNSUCCESSFUL CONNECTION -- #{url}")
- @connection_tally[:error] += 1
- @connection_tally[:error_server] += 1 if hdr.server_error?
- @connection_tally[:error_client] += 1 if hdr.client_error?
- return false
- end
- # let's not continue if not text/html
- unless hdr['CONTENT_TYPE'].include?('text/html')
- @already_crawled.insert(url)
- lg("Page Not text/html -- #{url}")
- return false
- end
- @connection_tally[:success] += 1
- true
- end
-
- def push_seo_to_data(url, new_page)
+ def push_seo_to_result(url, new_page)
  seos = [url]
  seos.concat(new_page.parse_seo)
  @result.push(seos)
  lg('--page SEO scraped')
  end

- def push_files_to_data(new_page)
+ def push_files_to_result(new_page)
  filez = new_page.parse_files(new_page.parse_internal)
  @result.concat(filez) unless filez.empty?
  lg("--#{filez.size} files found")
@@ -209,7 +213,7 @@ module Retriever
  if @progress && (@already_crawled.size < @max_pages)
  @progressbar.increment
  end
- Retriever::Page.new(response, @t)
+ Retriever::Page.new(url, response, @t)
  end

  def new_visitable_links(current_page)
@@ -217,10 +221,16 @@ module Retriever
  current_page.parse_internal_visitable
  end

+ def push_custom_to_result(url, current_page, &block)
+ data = block.call current_page
+ @result.push(data) unless data.empty?
+ lg("-- PageIterator called on: #{url}")
+ end
+
  # send a new wave of GET requests, using current @link_stack
  # at end of the loop it empties link_stack
  # puts new links into temporary stack
- def process_link_stack
+ def process_link_stack(&block)
  EM.synchrony do
  concurrency = 10
  EM::Synchrony::FiberIterator.new(@link_stack, concurrency).each do |url|
@@ -230,20 +240,19 @@ module Retriever
  next unless good_response?(resp, url)
  current_page = page_from_response(url, resp.response)
  # non-link dependent modes
- push_seo_to_data(url, current_page) if @seo
+ push_seo_to_result(url, current_page) if @seo
+ push_custom_to_result(url, current_page, &block) if @iterator
  next unless current_page.links.size > 0
  @temp_link_stack.push(new_visitable_links(current_page))
  # link dependent modes
  next unless @fileharvest
- push_files_to_data(current_page)
+ push_files_to_result(current_page)
  end
  EventMachine.stop
  end
  # empty the stack. most clean way
  @link_stack = []
  # temp contains redirects + new visitable links
- # we will re-initialize it as empty right after this function
- # in the parent method 'async crawl and collect'
  @temp_link_stack.flatten.uniq!
  end
  end
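Two things stand out in the Fetch changes above: initialization and crawling are now separate steps (`initialize` vs. `start`), and `good_response?` has moved out of the private section, which is what the new spec near the bottom of this diff exercises. A hedged sketch of calling it directly; FakeHeader and FakeResponse are illustrative stand-ins, not part of the gem:

```ruby
require 'retriever'

# good_response? only inspects resp.response_header, so anything that
# responds like an EM::HttpRequest response header works for a quick check.
class FakeHeader
  def redirection?;  false;       end
  def successful?;   true;        end
  def server_error?; false;       end
  def client_error?; false;       end
  def [](_key);      'text/html'; end
end
FakeResponse = Struct.new(:response_header)

fetcher = Retriever::Fetch.new('http://www.example.com', {})
resp    = FakeResponse.new(FakeHeader.new)
puts fetcher.good_response?(resp, 'http://www.example.com/about') # => true
```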
@@ -5,6 +5,7 @@ module Retriever
  class FetchFiles < Fetch
  def initialize(url, options)
  super
+ start
  temp_file_collection = @page_one.parse_files(@page_one.parse_internal)
  @result.concat(temp_file_collection) if temp_file_collection.size > 0
  lg("#{@result.size} new files found")
@@ -6,6 +6,7 @@ module Retriever
  # on all unique pages found on the site
  def initialize(url, options)
  super
+ start
  @result.push(@page_one.parse_seo)

  async_crawl_and_collect
@@ -5,6 +5,7 @@ module Retriever
  # returns an array of all unique pages found on the site
  def initialize(url, options)
  super
+ start
  @result.push(@t.target)
  @result.concat(@link_stack)

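The fetcher subclasses in the three hunks above now share the same two-phase shape: `super` wires up state, `start` crawls page one and builds the link stack, and the rest of the constructor collects results. Used as a library rather than through the `rr` executable, that looks roughly like the sketch below, assuming FetchSEO accepts the same option hash and exposes the `result` reader the PageIterator readme example relies on; the URL is a placeholder:

```ruby
require 'retriever'

# Collect SEO fields (page title, meta description, h1, h2) from up to
# ten pages of a placeholder site.
seo = Retriever::FetchSEO.new('http://www.example.com', 'maxpages' => 10)
seo.result.each { |row| puts row.to_s }
```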
@@ -1,5 +1,6 @@
  require 'addressable/uri'
-
+ #
+ using SourceString
  module Retriever
  #
  class Page
@@ -30,11 +31,12 @@ module Retriever
  wmv|flv|mp3|wav|doc|txt|ico|xml)
  /ix).freeze

- attr_reader :links, :source, :t
+ attr_reader :links, :source, :t, :url

- def initialize(source, t)
+ def initialize(url, source, t)
+ @url = url
  @t = t
- @source = source.encode('UTF-8', invalid: :replace, undef: :replace)
+ @source = source.encode_utf8_and_replace
  @links = nil
  end
@@ -59,24 +61,24 @@ module Retriever
  parse_internal.select { |x| !(NONPAGE_EXT_RE =~ x) }
  end

- def parse_files(arr)
+ def parse_files(arr = parse_internal)
  arr.select { |x| @t.file_re =~ x }
  end

  def title
- TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : ''
+ TITLE_RE =~ @source ? @source.match(TITLE_RE)[1].decode_html : ''
  end

  def desc
- DESC_RE =~ @source ? @source.match(DESC_RE)[1] : ''
+ DESC_RE =~ @source ? @source.match(DESC_RE)[1].decode_html : ''
  end

  def h1
- H1_RE =~ @source ? @source.match(H1_RE)[1] : ''
+ H1_RE =~ @source ? @source.match(H1_RE)[1].decode_html : ''
  end

  def h2
- H2_RE =~ @source ? @source.match(H2_RE)[1] : ''
+ H2_RE =~ @source ? @source.match(H2_RE)[1].decode_html : ''
  end

  def parse_seo
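Page now carries the URL it was built from and decodes HTML entities in the fields it extracts. A hedged illustration of the new constructor signature and accessors, using a made-up HTML snippet and placeholder URLs:

```ruby
require 'retriever'

source = "<title>Fish &amp; Chips</title><h1>Today&rsquo;s menu</h1>"
target = Retriever::Target.new('http://www.example.com/')
page   = Retriever::Page.new('http://www.example.com/menu', source, target)

puts page.url   # => "http://www.example.com/menu"
puts page.title # => "Fish & Chips"   (&amp; decoded by the refinement)
puts page.h1    # => "Today’s menu"   (&rsquo; decoded)
```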
@@ -0,0 +1,21 @@
+ module Retriever
+ #
+ class PageIterator < Fetch
+ # recieves target url and RR options, and a block
+ # runs the block on all pages during crawl, pushing
+ # the returned value of the block onto a result stack
+ # the complete data returned from the crawl is accessible thru self.result
+ def initialize(url, options, &block)
+ super
+ start
+ fail 'block required for PageIterator' unless block_given?
+ @iterator = true
+ @result.push(block.call @page_one)
+ lg("-- PageIterator crawled- #{url}")
+ async_crawl_and_collect(&block)
+ # done, make sure progress bar says we are done
+ @progressbar.finish if @progress
+ @result.sort_by! { |x| x.length } if @result.size > 1
+ end
+ end
+ end
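The readme changes below document PageIterator with a `[page.url, page.title]` example; as a second hedged sketch, the block can return anything derived from the page, for instance a per-page link count (placeholder URL, same 'maxpages' option key as the readme):

```ruby
require 'retriever'

counts = Retriever::PageIterator.new('http://www.example.com',
                                     'maxpages' => 5) do |page|
  # Whatever the block returns is pushed onto the result stack.
  [page.url, page.links.size]
end
puts counts.result.inspect
```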
@@ -1,4 +1,4 @@
  #
  module Retriever
- VERSION = '1.2.4'
+ VERSION = '1.3.0'
  end
data/lib/retriever.rb CHANGED
@@ -1,7 +1,9 @@
+ require 'retriever/core_ext'
  require 'retriever/fetch'
  require 'retriever/fetchfiles'
  require 'retriever/fetchsitemap'
  require 'retriever/fetchseo'
+ require 'retriever/page_iterator'
  require 'retriever/cli'
  require 'retriever/link'
  require 'retriever/target'
data/readme.md CHANGED
@@ -25,7 +25,8 @@ Features

  Use cases
  ---------
- RubyRetriever can do multiple things for you. With a single command at the terminal, RR can:
+ RubyRetriever can do multiple things for you. As an Executable
+ With a single command at the terminal, RR can:
  1. Crawl your website and output a *valid XML sitemap* based on what it found.
  2. Crawl a target website and *download all files of a given filetype*.
  3. Crawl a target website, *collect important SEO information* such as page titles, meta descriptions and h1 tags, and write it to CSV.
@@ -36,41 +37,44 @@ Getting started
  -----------
  Install the gem
  ```sh
- gem install rubyretriever
+ $ gem install rubyretriever
  ```
+

+ Using the Executable
+ --------------------
  **Example: Sitemap mode**
  ```sh
- rr --sitemap CSV --progress --limit 100 http://www.cnet.com
+ $ rr --sitemap CSV --progress --limit 10 http://www.cnet.com
  ```
  OR -- SAME COMMAND
  ```sh
- rr -s csv -p -l 100 http://www.cnet.com
+ $ rr -s csv -p -l 10 http://www.cnet.com
  ```

- This would map http://www.cnet.com until it crawled a max of 100 pages, then write the results to a CSV named cnet. Optionally, you could also use the format XML and RR would output the same URL list into a valid XML sitemap that could be submitted to Google.
+ This would map http://www.cnet.com until it crawled a max of 10 pages, then write the results to a CSV named cnet. Optionally, you could also use the format XML and RR would output the same URL list into a valid XML sitemap that could be submitted to Google.

  **Example: File Harvesting mode**
  ```sh
- rr --files pdf --progress --limit 1000 --out hubspot http://www.hubspot.com
+ $ rr --files txt --verbose --limit 1 http://textfiles.com/programming/
  ```
  OR -- SAME COMMAND
  ```sh
- rr -f pdf -p -l 100 http://www.hubspot.com
+ $ rr -f txt -v -l 1 http://textfiles.com/programming/
  ```

- This would crawl http://www.hubspot.com looking for filetype:PDF until it hit a max of 100 pages, then write out a list of filepaths to a CSV named hubspot (based on the website host name). Optionally, you could have the script autodownload all the files by adding the -a/--auto flag.
+ This would crawl http://textfiles.com/programming/ looking for txt files for only a single page, then write out a list of filepaths to txt files to the terminal. Optionally, you could have the script autodownload all the files by adding the -a/--auto flag.

  **Example: SEO mode**
  ```sh
- rr --seo --progress --limit 100 --out cnet-seo http://www.cnet.com
+ $ rr --seo --progress --limit 10 --out cnet-seo http://www.cnet.com
  ```
  OR -- SAME COMMAND
  ```sh
- rr -e -p -l 10 -o cnet-seo http://www.cnet.com
+ $ rr -e -p -l 10 -o cnet-seo http://www.cnet.com
  ```

- This would go to http://www.cnet.com and crawl a max of 100 pages, during which it would collect the SEO fields on those pages - this currently means [url, page title, meta description, h1 text, h2 text]. It would then write the fields to a csv named cnet-seo.
+ This would go to http://www.cnet.com and crawl a max of 10 pages, during which it would collect the SEO fields on those pages - this currently means [url, page title, meta description, h1 text, h2 text]. It would then write the fields to a csv named cnet-seo.


  command-line arguments
@@ -89,11 +93,48 @@ and OPTIONS is the applicable:
  -l, --limit PAGE_LIMIT_# *set a max on the total number of crawled pages*
  -h, --help *Display this screen*

+
+ Using as a Library (starting as of version 1.3.0 -- yet to be released)
+ ------------------
+
+ If you want to collect something, other than that which the executable allows, on a 'per page' basis then you want to use the PageIterator class. Then you can run whatever block you want against each individual page's source code located during the crawl.
+
+ Sample Script using **PageIterator**
+ ```ruby
+ require 'retriever'
+ opts = {
+ 'maxpages' => 1
+ }
+ t = Retriever::PageIterator.new('http://www.basecamp.com', opts) do |page|
+ [page.url, page.title]
+ end
+ puts t.result.to_s
+ ```
+
+ ```sh
+ >> [["http://www.basecamp.com", "Basecamp is everyone’s favorite project management app."]]
+ ```
+ Available methods on the page iterator:
+ * **#url** - returns full URL of current page
+ * **#source** - returns raw page source code
+ * **#title** - returns html decoded verson of curent page title
+ * **#desc** - returns html decoded verson of curent page meta description
+ * **#h1** - returns html decoded verson of current page's h1 tag
+ * **#h2** - returns html decoded verson of current page's h2 tag
+ * **#links** - returns array of all links on the page
+ * **#parse_internal** - returns array of current page's internal (same host) links
+ * **#parse_internal_visitable** - returns #parse_internal plus added filtering of only links that are visitable
+ * **#parse_seo** - returns array of current page's html decoded title, desc, h1 and h2
+ * **#parse_files** - returns array of downloaded files of type supplied as RR options (fileharvest options)
+
+
  Current Requirements
  ------------
  em-synchrony
  ruby-progressbar
  bloomfilter-rb
+ addressable
+ htmlentities

  License
  -------
data/spec/link_spec.rb CHANGED
@@ -3,7 +3,7 @@ require 'retriever'
  describe 'Link' do

  t = Retriever::Target.new('http://www.cnet.com/reviews/')
- let(:links) { Retriever::Page.new(@source, t).links }
+ let(:links) { Retriever::Page.new('http://www.cnet.com/reviews/', @source, t).links }

  it 'collects links in anchor tags' do
  @source = (<<SOURCE).strip
data/spec/page_spec.rb CHANGED
@@ -4,9 +4,18 @@ require 'retriever/fetch'
  t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)

  describe 'Page' do
+ describe '#url' do
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
+ it 'returns current page URL' do
+ @source = (<<SOURCE).strip
+ <a href='http://www.cnet.com/'>download</a>
+ SOURCE
+ expect(page.url).to eq('http://www.cnet.com/')
+ end
+ end

  describe '#links' do
- let(:links) { Retriever::Page.new(@source, t).links }
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
  it 'collects all unique href links on the page' do
  @source = (<<SOURCE).strip
  <a href='www.cnet.com/download.exe'>download</a>
@@ -17,12 +26,12 @@ describe 'Page' do
  <a href='http://www.yahoo.com/test/'>yahoo</a>
  SOURCE

- expect(links.size).to eq(4)
+ expect(page.links.size).to eq(4)
  end
  end

  describe '#parse_internal' do
- let(:page) { Retriever::Page.new(@source, t) }
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
  let(:links) { page.parse_internal }
  it 'filters links by host' do
  @source = (<<SOURCE).strip
@@ -35,7 +44,7 @@ SOURCE
  end

  describe '#parse_internal_visitable' do
- let(:page) { Retriever::Page.new(@source, t) }
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
  let(:links) { page.parse_internal_visitable }
  it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
  @source = (<<SOURCE).strip
@@ -46,7 +55,7 @@ SOURCE
  end

  describe '#parse_files' do
- let(:page) { Retriever::Page.new(@source, t) }
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
  let(:files) { page.parse_files(page.parse_internal) }
  it 'filters links by filetype' do
  @source = (<<SOURCE).strip
@@ -59,7 +68,7 @@ SOURCE
  end

  describe '#title' do
- let(:page) { Retriever::Page.new(@source, t) }
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
  it 'returns page title' do
  @source = (<<SOURCE).strip
  <title>test</title>
@@ -68,7 +77,7 @@ SOURCE
  end
  end
  describe '#desc' do
- let(:page) { Retriever::Page.new(@source, t) }
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
  it 'returns meta description' do
  @source = (<<SOURCE).strip
  <meta name='description' content="test2 ">
@@ -77,7 +86,7 @@ SOURCE
  end
  end
  describe '#h1' do
- let(:page) { Retriever::Page.new(@source, t) }
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
  it 'returns h1 text' do
  @source = (<<SOURCE).strip
  <h1>test 3</h1>
@@ -86,7 +95,7 @@ SOURCE
  end
  end
  describe '#h2' do
- let(:page) { Retriever::Page.new(@source, t) }
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
  it 'returns h2 text' do
  @source = (<<SOURCE).strip
  <h2> test 4 </h2>
@@ -1,4 +1,69 @@
  require 'retriever'

  describe 'Fetch' do
+ describe '#good_response?' do
+ let(:r) do
+ Retriever::Fetch.new('http://www.yahoo.com', {})
+ end
+
+ let(:resp) do
+ {}
+ end
+
+ let(:nil_response) do
+ r.good_response?(nil,'http://www.yahoo.com')
+ end
+
+ let(:unsuccessful_resp) do
+ resp.stub(:response_header).and_return(resp)
+ resp.stub(:redirection?).and_return(false)
+ resp.stub(:successful?).and_return(false)
+ resp.stub(:server_error?).and_return(false)
+ resp.stub(:client_error?).and_return(false)
+ r.good_response?(resp,'http://www.yahoo.com')
+ end
+
+ let(:redir_resp) do
+ resp.stub(:response_header).and_return(resp)
+ resp.stub(:redirection?).and_return(true)
+ resp.stub(:location).and_return('http://www.google.com')
+ r.good_response?(resp,'http://www.yahoo.com')
+ end
+
+ let(:bad_content_type_resp) do
+ resp.stub(:response_header).and_return(resp)
+ resp.stub(:redirection?).and_return(false)
+ resp.stub(:successful?).and_return(true)
+ resp['CONTENT_TYPE'] = 'image/jpeg'
+ r.good_response?(resp,'http://www.yahoo.com')
+ end
+
+ let(:success_resp) do
+ resp.stub(:response_header).and_return(resp)
+ resp.stub(:redirection?).and_return(false)
+ resp.stub(:successful?).and_return(true)
+ resp['CONTENT_TYPE'] = 'text/html'
+ r.good_response?(resp,'http://www.yahoo.com')
+ end
+
+ it 'returns false if the response is empty' do
+ expect(nil_response).to eq(false)
+ end
+
+ it 'returns false on unsuccessful connection' do
+ expect(unsuccessful_resp).to eq(false)
+ end
+
+ it 'returns false on redirecting host' do
+ expect(redir_resp).to eq(false)
+ end
+
+ it 'returns false on non-visitable content type' do
+ expect(bad_content_type_resp).to eq(false)
+ end
+
+ it 'returns true otherwise' do
+ expect(success_resp).to eq(true)
+ end
+ end
  end
data/spec/target_spec.rb CHANGED
@@ -1,9 +1,10 @@
  require 'retriever'
  require 'open-uri'

- t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)
-
  describe 'Target' do
+ let(:t) do
+ Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)
+ end

  it 'creates target var' do
  expect(t.target).to eq('http://www.cnet.com/reviews/')
@@ -30,6 +31,9 @@ describe 'Target' do
  end

  describe '#source' do
+ let(:redirecting_url) do
+ Retriever::Target.new('http://software-by-joe.appspot.com').source
+ end

  it 'opens URL and returns source as String' do
  expect(Retriever::Target.new('http://techcrunch.com/').source.class)
@@ -37,8 +41,7 @@ describe 'Target' do
  end

  it 'fails if target redirects to new host' do
- expect { Retriever::Target.new('http://tinyurl.com/nkfkypa').source }
- .to raise_error
+ expect { redirecting_url }.to raise_error
  end
  end
  end
metadata CHANGED
@@ -1,125 +1,139 @@
  --- !ruby/object:Gem::Specification
  name: rubyretriever
  version: !ruby/object:Gem::Version
- version: 1.2.4
+ version: 1.3.0
  platform: ruby
  authors:
  - Joe Norton
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2014-06-16 00:00:00.000000000 Z
+ date: 2014-06-22 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: em-synchrony
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
  - !ruby/object:Gem::Version
  version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: em-http-request
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
  - !ruby/object:Gem::Version
  version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: ruby-progressbar
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
  - !ruby/object:Gem::Version
  version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: bloomfilter-rb
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
  - !ruby/object:Gem::Version
  version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: addressable
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
  - !ruby/object:Gem::Version
  version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: '0'
+ - !ruby/object:Gem::Dependency
+ name: htmlentities
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: bundler
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - ~>
+ - - "~>"
  - !ruby/object:Gem::Version
  version: '1.6'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - ~>
+ - - "~>"
  - !ruby/object:Gem::Version
  version: '1.6'
  - !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - ~>
+ - - "~>"
  - !ruby/object:Gem::Version
  version: '10.3'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - ~>
+ - - "~>"
  - !ruby/object:Gem::Version
  version: '10.3'
  - !ruby/object:Gem::Dependency
  name: rspec
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - ~>
+ - - "~>"
  - !ruby/object:Gem::Version
  version: '2.14'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - ~>
+ - - "~>"
  - !ruby/object:Gem::Version
  version: '2.14'
  description: Asynchronous web crawler, scraper and file harvester
@@ -134,6 +148,7 @@ files:
  - bin/rr
  - lib/retriever.rb
  - lib/retriever/cli.rb
+ - lib/retriever/core_ext.rb
  - lib/retriever/fetch.rb
  - lib/retriever/fetchfiles.rb
  - lib/retriever/fetchseo.rb
@@ -141,6 +156,7 @@ files:
  - lib/retriever/link.rb
  - lib/retriever/openuri_redirect_patch.rb
  - lib/retriever/page.rb
+ - lib/retriever/page_iterator.rb
  - lib/retriever/target.rb
  - lib/retriever/version.rb
  - readme.md
@@ -159,17 +175,17 @@ require_paths:
  - lib
  required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
  - !ruby/object:Gem::Version
- version: 1.8.6
+ version: 2.0.0
  required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
  - !ruby/object:Gem::Version
  version: 1.3.6
  requirements: []
  rubyforge_project: rubyretriever
- rubygems_version: 2.2.2
+ rubygems_version: 2.3.0
  signing_key:
  specification_version: 4
  summary: Ruby Web Crawler & File Harvester