rubyretriever 1.2.4 → 1.3.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: a5749cf55198f97bab6c77297bf6409a2518bca0
- data.tar.gz: df792b6d3b1d03a8b70faadf651e20779f4fd1e8
+ metadata.gz: 05f8e6c0169af87c8284c8b6e98d5f25488b0980
+ data.tar.gz: a45a361b215b5ae7832e762b08bbdb989d0847a1
  SHA512:
- metadata.gz: 76e12598873e8779e196f84ec040a5c9fef01e410c884fa88b18f41335848a69dea778459bbb9629f941dbe12c8ab2c57032d24007a1f4fd57c5caec3a76abea
- data.tar.gz: 13b91c96ce17eb8ce802250505943d13efbd292cd95b120336ddd0ddd98278c513310c68eb9606a92b4018ef6ec9369369ee5869eff0abb589457c7b3bde41c3
+ metadata.gz: 8cee32f96e0ea0fe003a109016c6b17f9ddde9a73d72dbfd0a95c63e413b87b41c2ccf5bbc86b9886f78f59f053fbcd27aad0cbbbededbe8b002f0f7d986c528
+ data.tar.gz: fd0762069a69f7383a59b4058b46bde46793defec437eb1525e69f98f60e4429c1dc676d2d8ae3fa828c8d82d2018a9e9685bb15537424cb3e15fe0d5c472ade
data/bin/rr CHANGED
@@ -32,7 +32,7 @@ optparse = OptionParser.new do |opts|
  opts.on('-p', '--progress', 'Output progress bar') do
  options['progress'] = true
  end
- options['maxpages'] = false
+ options['maxpages'] = 100
  opts.on('-l',
  '--limit PAGE_LIMIT_#',
  'set a max on the total number of crawled pages') do |maxp|
data/lib/retriever/core_ext.rb ADDED
@@ -0,0 +1,13 @@
+ require 'htmlentities'
+ #
+ module SourceString
+ refine String do
+ def decode_html
+ HTMLEntities.new.decode(self)
+ end
+
+ def encode_utf8_and_replace
+ encode('UTF-8', invalid: :replace, undef: :replace)
+ end
+ end
+ end
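For context, here is a small sketch (not part of the diff) of what these two helpers do once a file opts in with `using SourceString`; the sample string is made up, and it assumes the gem (and therefore htmlentities) is installed:

```ruby
require 'retriever' # lib/retriever.rb below now loads retriever/core_ext first

# Refinements are lexically scoped, so every file that wants the helpers
# opts in explicitly -- page.rb further down in this diff does exactly this.
using SourceString

# decode_html converts HTML entities back into plain characters.
puts 'Fish &amp; Chips &#169; 2014'.decode_html
# => "Fish & Chips © 2014"

# encode_utf8_and_replace is what Page#initialize now calls to swap invalid
# or undefined byte sequences for replacement characters instead of raising
# an EncodingError.
```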
data/lib/retriever/fetch.rb CHANGED
@@ -15,6 +15,7 @@ module Retriever
  # There is no direct output
  # this is a parent class that the other fetch classes build off of.
  def initialize(url, options)
+ @iterator = false
  @result = []
  @connection_tally = {
  success: 0,
@@ -27,6 +28,9 @@ module Retriever
  @t = Retriever::Target.new(url, @file_re)
  @output = "rr-#{@t.host.split('.')[1]}" if @fileharvest && !@output
  @already_crawled = setup_bloom_filter
+ end
+
+ def start
  @page_one = crawl_page_one
  @link_stack = create_link_stack
  @temp_link_stack = []
@@ -80,6 +84,39 @@ module Retriever
  puts
  end

+ # returns true if resp is ok to continue
+ def good_response?(resp, url)
+ return false unless resp
+ hdr = resp.response_header
+ if hdr.redirection?
+ loc = hdr.location
+ lg("#{url} Redirected to #{loc}")
+ if t.host_re =~ loc
+ @temp_link_stack.push(loc) unless @already_crawled.include?(loc)
+ lg('--Added to stack for later')
+ return false
+ end
+ lg("Redirection outside of target host. No - go. #{loc}")
+ return false
+ end
+ # lets not continue if unsuccessful connection
+ unless hdr.successful?
+ lg("UNSUCCESSFUL CONNECTION -- #{url}")
+ @connection_tally[:error] += 1
+ @connection_tally[:error_server] += 1 if hdr.server_error?
+ @connection_tally[:error_client] += 1 if hdr.client_error?
+ return false
+ end
+ # let's not continue if not text/html
+ unless hdr['CONTENT_TYPE'] =~ %r{(text/html|application/xhtml+xml)}
+ @already_crawled.insert(url)
+ lg("Page Not text/html -- #{url}")
+ return false
+ end
+ @connection_tally[:success] += 1
+ true
+ end
+
  private

  def setup_options(options)
@@ -119,7 +156,7 @@ module Retriever
  end

  def crawl_page_one
- page_one = Retriever::Page.new(@t.source, @t)
+ page_one = Retriever::Page.new(@t.target, @t.source, @t)
  lg("URL Crawled: #{@t.target}")
  page_one
  end
@@ -141,13 +178,13 @@

  # iterates over the existing @link_stack
  # running until we reach the @max_pages value.
- def async_crawl_and_collect
+ def async_crawl_and_collect(&block)
  while @already_crawled.size < @max_pages
  if @link_stack.empty?
  end_crawl_notice
  break
  end
- new_links_arr = process_link_stack
+ new_links_arr = process_link_stack(&block)
  @temp_link_stack = []
  next if new_links_arr.nil? || new_links_arr.empty?
  @link_stack.concat(new_links_arr)
@@ -157,47 +194,14 @@ module Retriever
  @result.uniq!
  end

- # returns true is resp is ok to continue
- def good_response?(resp, url)
- return false unless resp
- hdr = resp.response_header
- if hdr.redirection?
- loc = hdr.location
- lg("#{url} Redirected to #{loc}")
- if t.host_re =~ loc
- @temp_link_stack.push(loc) unless @already_crawled.include?(loc)
- lg('--Added to stack for later')
- return false
- end
- lg("Redirection outside of target host. No - go. #{loc}")
- return false
- end
- # lets not continue if unsuccessful connection
- unless hdr.successful?
- lg("UNSUCCESSFUL CONNECTION -- #{url}")
- @connection_tally[:error] += 1
- @connection_tally[:error_server] += 1 if hdr.server_error?
- @connection_tally[:error_client] += 1 if hdr.client_error?
- return false
- end
- # let's not continue if not text/html
- unless hdr['CONTENT_TYPE'].include?('text/html')
- @already_crawled.insert(url)
- lg("Page Not text/html -- #{url}")
- return false
- end
- @connection_tally[:success] += 1
- true
- end
-
- def push_seo_to_data(url, new_page)
+ def push_seo_to_result(url, new_page)
  seos = [url]
  seos.concat(new_page.parse_seo)
  @result.push(seos)
  lg('--page SEO scraped')
  end

- def push_files_to_data(new_page)
+ def push_files_to_result(new_page)
  filez = new_page.parse_files(new_page.parse_internal)
  @result.concat(filez) unless filez.empty?
  lg("--#{filez.size} files found")
@@ -209,7 +213,7 @@ module Retriever
  if @progress && (@already_crawled.size < @max_pages)
  @progressbar.increment
  end
- Retriever::Page.new(response, @t)
+ Retriever::Page.new(url, response, @t)
  end

  def new_visitable_links(current_page)
@@ -217,10 +221,16 @@ module Retriever
  current_page.parse_internal_visitable
  end

+ def push_custom_to_result(url, current_page, &block)
+ data = block.call current_page
+ @result.push(data) unless data.empty?
+ lg("-- PageIterator called on: #{url}")
+ end
+
  # send a new wave of GET requests, using current @link_stack
  # at end of the loop it empties link_stack
  # puts new links into temporary stack
- def process_link_stack
+ def process_link_stack(&block)
  EM.synchrony do
  concurrency = 10
  EM::Synchrony::FiberIterator.new(@link_stack, concurrency).each do |url|
@@ -230,20 +240,19 @@ module Retriever
  next unless good_response?(resp, url)
  current_page = page_from_response(url, resp.response)
  # non-link dependent modes
- push_seo_to_data(url, current_page) if @seo
+ push_seo_to_result(url, current_page) if @seo
+ push_custom_to_result(url, current_page, &block) if @iterator
  next unless current_page.links.size > 0
  @temp_link_stack.push(new_visitable_links(current_page))
  # link dependent modes
  next unless @fileharvest
- push_files_to_data(current_page)
+ push_files_to_result(current_page)
  end
  EventMachine.stop
  end
  # empty the stack. most clean way
  @link_stack = []
  # temp contains redirects + new visitable links
- # we will re-initialize it as empty right after this function
- # in the parent method 'async crawl and collect'
  @temp_link_stack.flatten.uniq!
  end
  end
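One practical effect of splitting the crawl out of `initialize` and into `start` is that a `Fetch` object can now be constructed without issuing any requests, which is what the new `good_response?` spec near the bottom of this diff relies on. A rough sketch of that usage (the URL is only a placeholder):

```ruby
require 'retriever'

# Construction only parses the target and sets up options and the bloom
# filter; page one is not fetched until #start is called.
fetch = Retriever::Fetch.new('http://www.example.com', {})

# good_response? now sits above `private`, so it can be exercised directly;
# a missing response is rejected outright.
puts fetch.good_response?(nil, 'http://www.example.com')
# => false
```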
data/lib/retriever/fetchfiles.rb CHANGED
@@ -5,6 +5,7 @@ module Retriever
  class FetchFiles < Fetch
  def initialize(url, options)
  super
+ start
  temp_file_collection = @page_one.parse_files(@page_one.parse_internal)
  @result.concat(temp_file_collection) if temp_file_collection.size > 0
  lg("#{@result.size} new files found")
data/lib/retriever/fetchseo.rb CHANGED
@@ -6,6 +6,7 @@ module Retriever
  # on all unique pages found on the site
  def initialize(url, options)
  super
+ start
  @result.push(@page_one.parse_seo)

  async_crawl_and_collect
data/lib/retriever/fetchsitemap.rb CHANGED
@@ -5,6 +5,7 @@ module Retriever
  # returns an array of all unique pages found on the site
  def initialize(url, options)
  super
+ start
  @result.push(@t.target)
  @result.concat(@link_stack)

data/lib/retriever/page.rb CHANGED
@@ -1,5 +1,6 @@
  require 'addressable/uri'
-
+ #
+ using SourceString
  module Retriever
  #
  class Page
@@ -30,11 +31,12 @@ module Retriever
  wmv|flv|mp3|wav|doc|txt|ico|xml)
  /ix).freeze

- attr_reader :links, :source, :t
+ attr_reader :links, :source, :t, :url

- def initialize(source, t)
+ def initialize(url, source, t)
+ @url = url
  @t = t
- @source = source.encode('UTF-8', invalid: :replace, undef: :replace)
+ @source = source.encode_utf8_and_replace
  @links = nil
  end

@@ -59,24 +61,24 @@ module Retriever
  parse_internal.select { |x| !(NONPAGE_EXT_RE =~ x) }
  end

- def parse_files(arr)
+ def parse_files(arr = parse_internal)
  arr.select { |x| @t.file_re =~ x }
  end

  def title
- TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : ''
+ TITLE_RE =~ @source ? @source.match(TITLE_RE)[1].decode_html : ''
  end

  def desc
- DESC_RE =~ @source ? @source.match(DESC_RE)[1] : ''
+ DESC_RE =~ @source ? @source.match(DESC_RE)[1].decode_html : ''
  end

  def h1
- H1_RE =~ @source ? @source.match(H1_RE)[1] : ''
+ H1_RE =~ @source ? @source.match(H1_RE)[1].decode_html : ''
  end

  def h2
- H2_RE =~ @source ? @source.match(H2_RE)[1] : ''
+ H2_RE =~ @source ? @source.match(H2_RE)[1].decode_html : ''
  end

  def parse_seo
data/lib/retriever/page_iterator.rb ADDED
@@ -0,0 +1,21 @@
+ module Retriever
+ #
+ class PageIterator < Fetch
+ # receives target url and RR options, and a block
+ # runs the block on all pages during crawl, pushing
+ # the returned value of the block onto a result stack
+ # the complete data returned from the crawl is accessible through self.result
+ def initialize(url, options, &block)
+ super
+ start
+ fail 'block required for PageIterator' unless block_given?
+ @iterator = true
+ @result.push(block.call @page_one)
+ lg("-- PageIterator crawled- #{url}")
+ async_crawl_and_collect(&block)
+ # done, make sure progress bar says we are done
+ @progressbar.finish if @progress
+ @result.sort_by! { |x| x.length } if @result.size > 1
+ end
+ end
+ end
data/lib/retriever/version.rb CHANGED
@@ -1,4 +1,4 @@
  #
  module Retriever
- VERSION = '1.2.4'
+ VERSION = '1.3.0'
  end
data/lib/retriever.rb CHANGED
@@ -1,7 +1,9 @@
+ require 'retriever/core_ext'
  require 'retriever/fetch'
  require 'retriever/fetchfiles'
  require 'retriever/fetchsitemap'
  require 'retriever/fetchseo'
+ require 'retriever/page_iterator'
  require 'retriever/cli'
  require 'retriever/link'
  require 'retriever/target'
data/readme.md CHANGED
@@ -25,7 +25,8 @@ Features

  Use cases
  ---------
- RubyRetriever can do multiple things for you. With a single command at the terminal, RR can:
+ RubyRetriever can do multiple things for you. As an Executable
+ With a single command at the terminal, RR can:
  1. Crawl your website and output a *valid XML sitemap* based on what it found.
  2. Crawl a target website and *download all files of a given filetype*.
  3. Crawl a target website, *collect important SEO information* such as page titles, meta descriptions and h1 tags, and write it to CSV.
@@ -36,41 +37,44 @@ Getting started
  -----------
  Install the gem
  ```sh
- gem install rubyretriever
+ $ gem install rubyretriever
  ```
+

+ Using the Executable
+ --------------------
  **Example: Sitemap mode**
  ```sh
- rr --sitemap CSV --progress --limit 100 http://www.cnet.com
+ $ rr --sitemap CSV --progress --limit 10 http://www.cnet.com
  ```
  OR -- SAME COMMAND
  ```sh
- rr -s csv -p -l 100 http://www.cnet.com
+ $ rr -s csv -p -l 10 http://www.cnet.com
  ```

- This would map http://www.cnet.com until it crawled a max of 100 pages, then write the results to a CSV named cnet. Optionally, you could also use the format XML and RR would output the same URL list into a valid XML sitemap that could be submitted to Google.
+ This would map http://www.cnet.com until it crawled a max of 10 pages, then write the results to a CSV named cnet. Optionally, you could also use the format XML and RR would output the same URL list into a valid XML sitemap that could be submitted to Google.

  **Example: File Harvesting mode**
  ```sh
- rr --files pdf --progress --limit 1000 --out hubspot http://www.hubspot.com
+ $ rr --files txt --verbose --limit 1 http://textfiles.com/programming/
  ```
  OR -- SAME COMMAND
  ```sh
- rr -f pdf -p -l 100 http://www.hubspot.com
+ $ rr -f txt -v -l 1 http://textfiles.com/programming/
  ```

- This would crawl http://www.hubspot.com looking for filetype:PDF until it hit a max of 100 pages, then write out a list of filepaths to a CSV named hubspot (based on the website host name). Optionally, you could have the script autodownload all the files by adding the -a/--auto flag.
+ This would crawl http://textfiles.com/programming/ looking for txt files for only a single page, then write out a list of filepaths to txt files to the terminal. Optionally, you could have the script autodownload all the files by adding the -a/--auto flag.

  **Example: SEO mode**
  ```sh
- rr --seo --progress --limit 100 --out cnet-seo http://www.cnet.com
+ $ rr --seo --progress --limit 10 --out cnet-seo http://www.cnet.com
  ```
  OR -- SAME COMMAND
  ```sh
- rr -e -p -l 10 -o cnet-seo http://www.cnet.com
+ $ rr -e -p -l 10 -o cnet-seo http://www.cnet.com
  ```

- This would go to http://www.cnet.com and crawl a max of 100 pages, during which it would collect the SEO fields on those pages - this currently means [url, page title, meta description, h1 text, h2 text]. It would then write the fields to a csv named cnet-seo.
+ This would go to http://www.cnet.com and crawl a max of 10 pages, during which it would collect the SEO fields on those pages - this currently means [url, page title, meta description, h1 text, h2 text]. It would then write the fields to a csv named cnet-seo.


  command-line arguments
@@ -89,11 +93,48 @@ and OPTIONS is the applicable:
  -l, --limit PAGE_LIMIT_# *set a max on the total number of crawled pages*
  -h, --help *Display this screen*

+
+ Using as a Library (starting as of version 1.3.0 -- yet to be released)
+ ------------------
+
+ If you want to collect something, other than that which the executable allows, on a 'per page' basis then you want to use the PageIterator class. Then you can run whatever block you want against each individual page's source code located during the crawl.
+
+ Sample Script using **PageIterator**
+ ```ruby
+ require 'retriever'
+ opts = {
+ 'maxpages' => 1
+ }
+ t = Retriever::PageIterator.new('http://www.basecamp.com', opts) do |page|
+ [page.url, page.title]
+ end
+ puts t.result.to_s
+ ```
+
+ ```sh
+ >> [["http://www.basecamp.com", "Basecamp is everyone’s favorite project management app."]]
+ ```
+ Available methods on the page iterator:
+ * **#url** - returns full URL of current page
+ * **#source** - returns raw page source code
+ * **#title** - returns html decoded version of current page title
+ * **#desc** - returns html decoded version of current page meta description
+ * **#h1** - returns html decoded version of current page's h1 tag
+ * **#h2** - returns html decoded version of current page's h2 tag
+ * **#links** - returns array of all links on the page
+ * **#parse_internal** - returns array of current page's internal (same host) links
+ * **#parse_internal_visitable** - returns #parse_internal plus added filtering of only links that are visitable
+ * **#parse_seo** - returns array of current page's html decoded title, desc, h1 and h2
+ * **#parse_files** - returns array of downloaded files of type supplied as RR options (fileharvest options)
+
+
  Current Requirements
  ------------
  em-synchrony
  ruby-progressbar
  bloomfilter-rb
+ addressable
+ htmlentities

  License
  -------
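A slightly larger library example in the same spirit as the readme snippet above, combining a few of the listed page methods (the target URL and page limit are arbitrary):

```ruby
require 'retriever'

opts = { 'maxpages' => 5 }

# For every crawled page, record its URL, its h1 text and how many
# same-host, visitable links it exposes.
crawl = Retriever::PageIterator.new('http://www.cnet.com', opts) do |page|
  [page.url, page.h1, page.parse_internal_visitable.size]
end

crawl.result.each do |url, h1, link_count|
  puts "#{url} | #{h1} | #{link_count}"
end
```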
data/spec/link_spec.rb CHANGED
@@ -3,7 +3,7 @@ require 'retriever'
  describe 'Link' do

  t = Retriever::Target.new('http://www.cnet.com/reviews/')
- let(:links) { Retriever::Page.new(@source, t).links }
+ let(:links) { Retriever::Page.new('http://www.cnet.com/reviews/', @source, t).links }

  it 'collects links in anchor tags' do
  @source = (<<SOURCE).strip
data/spec/page_spec.rb CHANGED
@@ -4,9 +4,18 @@ require 'retriever/fetch'
  t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)

  describe 'Page' do
+ describe '#url' do
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
+ it 'returns current page URL' do
+ @source = (<<SOURCE).strip
+ <a href='http://www.cnet.com/'>download</a>
+ SOURCE
+ expect(page.url).to eq('http://www.cnet.com/')
+ end
+ end

  describe '#links' do
- let(:links) { Retriever::Page.new(@source, t).links }
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
  it 'collects all unique href links on the page' do
  @source = (<<SOURCE).strip
  <a href='www.cnet.com/download.exe'>download</a>
@@ -17,12 +26,12 @@ describe 'Page' do
  <a href='http://www.yahoo.com/test/'>yahoo</a>
  SOURCE

- expect(links.size).to eq(4)
+ expect(page.links.size).to eq(4)
  end
  end

  describe '#parse_internal' do
- let(:page) { Retriever::Page.new(@source, t) }
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
  let(:links) { page.parse_internal }
  it 'filters links by host' do
  @source = (<<SOURCE).strip
@@ -35,7 +44,7 @@ SOURCE
  end

  describe '#parse_internal_visitable' do
- let(:page) { Retriever::Page.new(@source, t) }
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
  let(:links) { page.parse_internal_visitable }
  it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
  @source = (<<SOURCE).strip
@@ -46,7 +55,7 @@ SOURCE
  end

  describe '#parse_files' do
- let(:page) { Retriever::Page.new(@source, t) }
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
  let(:files) { page.parse_files(page.parse_internal) }
  it 'filters links by filetype' do
  @source = (<<SOURCE).strip
@@ -59,7 +68,7 @@ SOURCE
  end

  describe '#title' do
- let(:page) { Retriever::Page.new(@source, t) }
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
  it 'returns page title' do
  @source = (<<SOURCE).strip
  <title>test</title>
@@ -68,7 +77,7 @@ SOURCE
  end
  end
  describe '#desc' do
- let(:page) { Retriever::Page.new(@source, t) }
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
  it 'returns meta description' do
  @source = (<<SOURCE).strip
  <meta name='description' content="test2 ">
@@ -77,7 +86,7 @@ SOURCE
  end
  end
  describe '#h1' do
- let(:page) { Retriever::Page.new(@source, t) }
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
  it 'returns h1 text' do
  @source = (<<SOURCE).strip
  <h1>test 3</h1>
@@ -86,7 +95,7 @@ SOURCE
  end
  end
  describe '#h2' do
- let(:page) { Retriever::Page.new(@source, t) }
+ let(:page) { Retriever::Page.new('http://www.cnet.com/', @source, t) }
  it 'returns h2 text' do
  @source = (<<SOURCE).strip
  <h2> test 4 </h2>
@@ -1,4 +1,69 @@
  require 'retriever'

  describe 'Fetch' do
+ describe '#good_response?' do
+ let(:r) do
+ Retriever::Fetch.new('http://www.yahoo.com', {})
+ end
+
+ let(:resp) do
+ {}
+ end
+
+ let(:nil_response) do
+ r.good_response?(nil,'http://www.yahoo.com')
+ end
+
+ let(:unsuccessful_resp) do
+ resp.stub(:response_header).and_return(resp)
+ resp.stub(:redirection?).and_return(false)
+ resp.stub(:successful?).and_return(false)
+ resp.stub(:server_error?).and_return(false)
+ resp.stub(:client_error?).and_return(false)
+ r.good_response?(resp,'http://www.yahoo.com')
+ end
+
+ let(:redir_resp) do
+ resp.stub(:response_header).and_return(resp)
+ resp.stub(:redirection?).and_return(true)
+ resp.stub(:location).and_return('http://www.google.com')
+ r.good_response?(resp,'http://www.yahoo.com')
+ end
+
+ let(:bad_content_type_resp) do
+ resp.stub(:response_header).and_return(resp)
+ resp.stub(:redirection?).and_return(false)
+ resp.stub(:successful?).and_return(true)
+ resp['CONTENT_TYPE'] = 'image/jpeg'
+ r.good_response?(resp,'http://www.yahoo.com')
+ end
+
+ let(:success_resp) do
+ resp.stub(:response_header).and_return(resp)
+ resp.stub(:redirection?).and_return(false)
+ resp.stub(:successful?).and_return(true)
+ resp['CONTENT_TYPE'] = 'text/html'
+ r.good_response?(resp,'http://www.yahoo.com')
+ end
+
+ it 'returns false if the response is empty' do
+ expect(nil_response).to eq(false)
+ end
+
+ it 'returns false on unsuccessful connection' do
+ expect(unsuccessful_resp).to eq(false)
+ end
+
+ it 'returns false on redirecting host' do
+ expect(redir_resp).to eq(false)
+ end
+
+ it 'returns false on non-visitable content type' do
+ expect(bad_content_type_resp).to eq(false)
+ end
+
+ it 'returns true otherwise' do
+ expect(success_resp).to eq(true)
+ end
+ end
  end
data/spec/target_spec.rb CHANGED
@@ -1,9 +1,10 @@
  require 'retriever'
  require 'open-uri'

- t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)
-
  describe 'Target' do
+ let(:t) do
+ Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)
+ end

  it 'creates target var' do
  expect(t.target).to eq('http://www.cnet.com/reviews/')
@@ -30,6 +31,9 @@ describe 'Target' do
  end

  describe '#source' do
+ let(:redirecting_url) do
+ Retriever::Target.new('http://software-by-joe.appspot.com').source
+ end

  it 'opens URL and returns source as String' do
  expect(Retriever::Target.new('http://techcrunch.com/').source.class)
@@ -37,8 +41,7 @@ describe 'Target' do
  end

  it 'fails if target redirects to new host' do
- expect { Retriever::Target.new('http://tinyurl.com/nkfkypa').source }
- .to raise_error
+ expect { redirecting_url }.to raise_error
  end
  end
  end
metadata CHANGED
@@ -1,125 +1,139 @@
  --- !ruby/object:Gem::Specification
  name: rubyretriever
  version: !ruby/object:Gem::Version
- version: 1.2.4
+ version: 1.3.0
  platform: ruby
  authors:
  - Joe Norton
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2014-06-16 00:00:00.000000000 Z
+ date: 2014-06-22 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: em-synchrony
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
  - !ruby/object:Gem::Version
  version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: em-http-request
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
  - !ruby/object:Gem::Version
  version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: ruby-progressbar
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
  - !ruby/object:Gem::Version
  version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: bloomfilter-rb
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
  - !ruby/object:Gem::Version
  version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: addressable
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
  - !ruby/object:Gem::Version
  version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: '0'
+ - !ruby/object:Gem::Dependency
+ name: htmlentities
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - ">="
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: bundler
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - ~>
+ - - "~>"
  - !ruby/object:Gem::Version
  version: '1.6'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - ~>
+ - - "~>"
  - !ruby/object:Gem::Version
  version: '1.6'
  - !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - ~>
+ - - "~>"
  - !ruby/object:Gem::Version
  version: '10.3'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - ~>
+ - - "~>"
  - !ruby/object:Gem::Version
  version: '10.3'
  - !ruby/object:Gem::Dependency
  name: rspec
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - ~>
+ - - "~>"
  - !ruby/object:Gem::Version
  version: '2.14'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - ~>
+ - - "~>"
  - !ruby/object:Gem::Version
  version: '2.14'
  description: Asynchronous web crawler, scraper and file harvester
@@ -134,6 +148,7 @@ files:
  - bin/rr
  - lib/retriever.rb
  - lib/retriever/cli.rb
+ - lib/retriever/core_ext.rb
  - lib/retriever/fetch.rb
  - lib/retriever/fetchfiles.rb
  - lib/retriever/fetchseo.rb
@@ -141,6 +156,7 @@ files:
  - lib/retriever/link.rb
  - lib/retriever/openuri_redirect_patch.rb
  - lib/retriever/page.rb
+ - lib/retriever/page_iterator.rb
  - lib/retriever/target.rb
  - lib/retriever/version.rb
  - readme.md
@@ -159,17 +175,17 @@ require_paths:
  - lib
  required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
  - !ruby/object:Gem::Version
- version: 1.8.6
+ version: 2.0.0
  required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
- - - '>='
+ - - ">="
  - !ruby/object:Gem::Version
  version: 1.3.6
  requirements: []
  rubyforge_project: rubyretriever
- rubygems_version: 2.2.2
+ rubygems_version: 2.3.0
  signing_key:
  specification_version: 4
  summary: Ruby Web Crawler & File Harvester