rubyretriever 0.1.4 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 924c9958e88587353cc80f4b134cca91f73f3e57
- data.tar.gz: 393457cd37ad3fb372008a7829c8028f658f2b58
+ metadata.gz: 80eba5c4fdf8c33a19ca35ba37b4493cf44ab57a
+ data.tar.gz: 4dbca842c4f56060e13cfe1c0acf0256321df573
  SHA512:
- metadata.gz: 5dcde12eb9fea2181b6a91c0d798351b78efa80652547afd02db536ab9d139de2969b08326d61363435baf04cc4036a0ed0a4cbdd9c884bd05314b8210c38938
- data.tar.gz: 8c226a13d4e0b29beffc1940b6ca05ff9f4ae403decc1990a2a6418f90fca12e132852c48eea082918d74cb593006e98ed14ffbe9366d9dd64ef0f058eefd7a2
+ metadata.gz: 48181c41247d85b16db74eb8b7c0a74c23c9740d00d2fd79ecfdb8435efa64d81fb34ccbc81e32911d7fe0e6942c6f7c7c9f91d39feeb434ab078f659ada1341
+ data.tar.gz: 3dace96b1bd42fa2292e8a9db3506983d688e7877b12ca30e550a81be9bad6a7de9907eb2f2b75c6263146a8721d08437dfabe06a493de057dc8a764b57c3a39
data/bin/rr CHANGED
@@ -1,18 +1,24 @@
  #! /usr/bin/env ruby
+
  require 'retriever'
  require 'optparse'
+
  options = {}
  optparse = OptionParser.new do|opts|
  # Set a banner, displayed at the top
  # of the help screen.
  opts.banner = "Usage: rr [MODE FLAG] [options] Target_URL"
  options[:sitemap] = false
- opts.on( '-s', '--sitemap FORMAT', 'MODE FLAG: Sitemap mode - Crawl site and output sitemap, format choices: CSV or XML' ) do |output_type|
- options[:sitemap] = output_type
+ opts.on( '-s', '--sitemap [FORMAT]', 'MODE FLAG: Sitemap mode' ) do |output_type|
+ options[:sitemap] = output_type||''
  end
  options[:fileharvest] = false
- opts.on( '-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode - Crawl site and collect links for files found, extension for filetype' ) do |file_ext|
+ opts.on( '-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode' ) do |file_ext|
  options[:fileharvest] = file_ext
+ end
+ options[:seo] = false
+ opts.on( '-e', '--seo', 'MODE FLAG: SEO mode' ) do
+ options[:seo] = true
  end
  options[:filename] = nil
  opts.on( '-o', '--out FILENAME', 'Dump output to selected filename' ) do|filename|
@@ -56,14 +62,14 @@ ARGV.each do|q|
  puts "### Outputting in format: #{options[:sitemap]}" if options[:sitemap]
  puts "### Performing File Harvest" if options[:fileharvest]
  puts "### Searching for file extension: #{options[:fileharvest]} pages" if (options[:fileharvest])
+ puts "### Performing SEO Scrape" if options[:seo]
  puts "### Writting output to filename: #{options[:filename]}" if options[:filename]
  puts "### Being verbose"
- puts "### Stopping after #{options[:maxpages]} pages" if options[:maxpages]
+ puts "### Stopping after #{options[:maxpages]} pages"
  end
  puts "###############################"
  puts "### [RubyRetriever] go fetch #{q}"
- Retriever::FetchFiles.new(q, options) if options[:fileharvest]
- Retriever::FetchSitemap.new(q, options) if options[:sitemap]
+ Retriever::CLI.new(q, options)
  puts "### [RubyRetriever] is done."
  puts "###############################"
  puts
data/lib/retriever/cli.rb ADDED
@@ -0,0 +1,27 @@
+ module Retriever
+ class CLI
+ def initialize(url,options)
+
+ #kick off the fetch mode of choice
+ if options[:fileharvest]
+ @fetch = Retriever::FetchFiles.new(url, options)
+ elsif options[:sitemap]
+ @fetch = Retriever::FetchSitemap.new(url, options)
+ elsif options[:seo]
+ @fetch = Retriever::FetchSEO.new(url, options)
+ else
+ fail "### Error: No Mode Selected"
+ end
+
+ #all fetch modes
+ @fetch.dump
+ @fetch.write if options[:filename]
+
+ #fileharvest only
+ @fetch.autodownload if options[:autodown] && options[:fileharvest]
+
+ #sitemap only
+ @fetch.gen_xml if /XML/i =~ options[:sitemap].to_s
+ end
+ end
+ end
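With this change `bin/rr` no longer instantiates the fetch classes directly; it hands the parsed options straight to `Retriever::CLI`, which picks the mode. A minimal sketch of driving the same entry point from Ruby rather than the command line (the URL, limit, and filename below are illustrative; the option keys mirror the ones `bin/rr` builds above):

```ruby
require 'retriever'

# Roughly what `rr -e -l 10 -o cnet-seo http://www.cnet.com` would build:
options = {
  :seo      => true,       # -e / --seo mode flag
  :maxpages => 10,         # -l / --limit
  :filename => 'cnet-seo'  # -o / --out; CLI calls @fetch.write when this is set
}

# CLI routes to FetchSEO here, dumps results to stdout, and writes a CSV
# because :filename is present. With :fileharvest or :sitemap set instead,
# it would route to FetchFiles or FetchSitemap the same way.
Retriever::CLI.new('http://www.cnet.com', options)
```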
data/lib/retriever/fetch.rb CHANGED
@@ -9,12 +9,14 @@ require 'bloomfilter-rb'
  module Retriever
  class Fetch
  attr_reader :maxPages, :t
- #constants
- HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
- NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico)/ix).freeze
 
  def initialize(url,options)
- @t = Retriever::Target.new(url)
+ @connection_tally = {
+ :success => 0,
+ :error => 0,
+ :error_client => 0,
+ :error_server => 0
+ }
  #OPTIONS
  @prgrss = options[:progress] ? options[:progress] : false
  @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
@@ -23,6 +25,7 @@ module Retriever
  @fh = options[:fileharvest] ? options[:fileharvest] : false
  @file_ext = @fh.to_s
  @s = options[:sitemap] ? options[:sitemap] : false
+ @seo = options[:seo] ? true : false
  @autodown = options[:autodown] ? true : false
  #
  if @fh
@@ -30,9 +33,6 @@ module Retriever
  @file_re = Regexp.new(tempExtStr).freeze
  else
  errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
- if !@output
- @output = "rr-#{@t.host.split('.')[1]}"
- end
  end
  if @prgrss
  errlog("CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p") if @v #verbose & progressbar conflict
@@ -44,8 +44,13 @@ module Retriever
  }
  @progressbar = ProgressBar.create(prgressVars)
  end
+ @t = Retriever::Target.new(url,@file_re)
  @already_crawled = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
  @already_crawled.insert(@t.target)
+ if (@fh && !@output)
+ @output = "rr-#{@t.host.split('.')[1]}"
+ end
+ fail "bad page source on target -- try HTTPS?" if !@t.source
  end
  def errlog(msg)
  raise "ERROR: #{msg}"
@@ -53,52 +58,52 @@ module Retriever
  def lg(msg)
  puts "### #{msg}" if @v
  end
- def dump(data)
+ def dump
  puts "###############################"
+ if @v
+ puts "Connection Tally:"
+ puts @connection_tally.to_s
+ puts "###############################"
+ end
  if @s
  puts "#{@t.target} Sitemap"
- puts "Page Count: #{data.size}"
+ puts "Page Count: #{@data.size}"
  elsif @fh
  puts "Target URL: #{@t.target}"
  puts "Filetype: #{@file_ext}"
- puts "File Count: #{data.size}"
+ puts "File Count: #{@data.size}"
+ elsif @seo
+ puts "#{@t.target} SEO Metrics"
+ puts "Page Count: #{@data.size}"
  else
- puts "ERROR"
+ fail "ERROR - Cannot dump - Mode Not Found"
  end
  puts "###############################"
- puts data
+ @data.each do |line|
+ puts line
+ end
  puts "###############################"
  puts
  end
- def write(data)
+ def write
  if @output
+ i = 0
  CSV.open("#{@output}.csv", "w") do |csv|
- data.each do |entry|
- csv << [entry]
- end
+ if ((i == 0) && @seo)
+ csv << ['URL','Page Title','Meta Description','H1','H2']
+ i +=1
+ end
+ @data.each do |entry|
+ csv << entry
+ end
  end
  puts "###############################"
  puts "File Created: #{@output}.csv"
- puts "Object Count: #{data.size}"
+ puts "Object Count: #{@data.size}"
  puts "###############################"
  puts
  end
  end
- #recieves page source as string
- #returns array of unique href links
- def fetchLinks(doc)
- return false if !doc
- doc.scan(HREF_CONTENTS_RE).map do |match| #filter some malformed URLS that come in, this is meant to be a loose filter to catch all reasonable HREF attributes.
- link = match[0]
- Link.new(@t.host, link).path
- end.uniq
- end
- def parseInternalLinks(all_links)
- all_links.select{ |linky| (@t.host_re =~ linky) }
- end
- def parseInternalVisitableLinks(all_links)
- parseInternalLinks(all_links).select{ |linky| (!(NONPAGE_EXT_RE =~linky)) }
- end
  def async_crawl_and_collect()
  while (@already_crawled.size < @maxPages)
  if @linkStack.empty?
@@ -112,11 +117,41 @@ module Retriever
  new_links_arr = self.asyncGetWave()
  next if (new_links_arr.nil? || new_links_arr.empty?)
  new_link_arr = new_links_arr-@linkStack#set operations to see are these in our previous visited pages arr?
- @linkStack.concat(new_links_arr)
- @sitemap.concat(new_links_arr) if @s
+ @linkStack.concat(new_links_arr).uniq!
+ @data.concat(new_links_arr) if @s
  end
  @progressbar.finish if @prgrss
  end
+ def good_response?(resp, url) #returns true is resp is ok to continue process, false is we need to 'next' it
+ return false if !resp
+ if resp.response_header.redirection? #we got redirected
+ loc = resp.response_header.location
+ lg("#{url} Redirected to #{loc}")
+ if t.host_re =~ loc #if being redirected to same host, let's add to linkstack
+ @linkStack.push(loc) if !@already_crawled.include?(loc) #but only if we haven't already crawled it
+ lg("--Added to linkStack for later")
+ return false
+ end
+ lg("Redirection outside of target host. No - go. #{loc}")
+ return false
+ end
+ if (!resp.response_header.successful?) #if webpage is not text/html, let's not continue and lets also make sure we dont re-queue it
+ lg("UNSUCCESSFUL CONNECTION -- #{url}")
+ @connection_tally[:error] += 1
+ @connection_tally[:error_server] += 1 if resp.response_header.server_error?
+ @connection_tally[:error_client] += 1 if resp.response_header.client_error?
+ return false
+ end
+ if (!(resp.response_header['CONTENT_TYPE'].include?("text/html"))) #if webpage is not text/html, let's not continue and lets also make sure we dont re-queue it
+ @already_crawled.insert(url)
+ @linkStack.delete(url)
+ lg("Page Not text/html -- #{url}")
+ return false
+ end
+ @connection_tally[:success] += 1
+ return true
+ end
+
  def asyncGetWave() #send a new wave of GET requests, using current @linkStack
  new_stuff = []
  EM.synchrony do
@@ -129,20 +164,27 @@ module Retriever
  next
  end
  resp = EventMachine::HttpRequest.new(url).get
- lg("URL Crawled: #{url}")
+ next if !good_response?(resp,url)
+ new_page = Retriever::Page.new(resp.response,@t)
+ lg("Page Fetched: #{url}")
  @already_crawled.insert(url)
  if @prgrss
  @progressbar.increment if @already_crawled.size < @maxPages
  end
- new_links_arr = self.fetchLinks(resp.response)
- if new_links_arr
- lg("#{new_links_arr.size} new links found")
- internal_links_arr = self.parseInternalLinks(new_links_arr)
+ if @seo
+ seos = [url]
+ seos.concat(new_page.parseSEO)
+ @data.push(seos)
+ lg("--page SEO scraped")
+ end
+ if new_page.links
+ lg("--#{new_page.links.size} links found")
+ internal_links_arr = new_page.parseInternalVisitable
  new_stuff.push(internal_links_arr)
  if @fh
- filez = self.parseFiles(new_links_arr)
- @fileStack.concat(filez) if !filez.empty?
- lg("#{filez.size} files found")
+ filez = new_page.parseFiles
+ @data.concat(filez) if !filez.empty?
+ lg("--#{filez.size} files found")
  end
  end
  end
@@ -151,8 +193,5 @@ module Retriever
  end
  new_stuff.uniq!
  end
- def parseFiles(all_links)
- all_links.select{ |linky| (@file_re =~ linky)}
- end
  end
  end
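The other notable addition in `Fetch` is `good_response?`, which classifies every response before the crawler parses it and feeds the new `@connection_tally` that `dump` prints in verbose mode. A standalone sketch of that classification order (redirect, then status code, then content type), using a plain struct as a stand-in for the em-http-request response header rather than the gem's actual objects:

```ruby
# Illustrative stand-in for resp.response_header; em-http-request exposes
# similar predicate methods.
FakeHeader = Struct.new(:status, :location, :content_type) do
  def redirection?;  (300..399).cover?(status) end
  def successful?;   (200..299).cover?(status) end
  def client_error?; (400..499).cover?(status) end
  def server_error?; (500..599).cover?(status) end
end

tally = { :success => 0, :error => 0, :error_client => 0, :error_server => 0 }

# Mirrors the order of checks in good_response?: redirects are skipped (and
# possibly re-queued), bad status codes are tallied, non-HTML is dropped.
def keep_crawling?(header, tally)
  return false if header.redirection?
  unless header.successful?
    tally[:error] += 1
    tally[:error_client] += 1 if header.client_error?
    tally[:error_server] += 1 if header.server_error?
    return false
  end
  return false unless header.content_type.include?('text/html')
  tally[:success] += 1
  true
end

keep_crawling?(FakeHeader.new(200, nil, 'text/html'), tally)  # => true
keep_crawling?(FakeHeader.new(503, nil, 'text/html'), tally)  # => false, :error_server tallied
```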
data/lib/retriever/fetchfiles.rb CHANGED
@@ -1,17 +1,16 @@
  module Retriever
  class FetchFiles < Fetch
- attr_reader :fileStack
  def initialize(url,options)
  super
- @fileStack = []
- all_links = self.fetchLinks(@t.source)
- @linkStack = self.parseInternalVisitableLinks(all_links)
+ @data = []
+ page_one = Retriever::Page.new(@t.source,@t)
+ @linkStack = page_one.parseInternalVisitable
  lg("URL Crawled: #{@t.target}")
- self.lg("#{@linkStack.size-1} new links found")
+ lg("#{@linkStack.size-1} new links found")
 
- tempFileCollection = self.parseFiles(all_links)
- @fileStack.concat(tempFileCollection) if tempFileCollection.size>0
- self.lg("#{@fileStack.size} new files found")
+ tempFileCollection = page_one.parseFiles
+ @data.concat(tempFileCollection) if tempFileCollection.size>0
+ lg("#{@data.size} new files found")
  errlog("Bad URL -- #{@t.target}") if !@linkStack
 
  @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
@@ -19,19 +18,14 @@ module Retriever
 
  self.async_crawl_and_collect()
 
- @fileStack.sort_by! {|x| x.length}
- @fileStack.uniq!
-
- self.dump(self.fileStack)
- self.write(@output,self.fileStack) if @output
- self.autodownload() if @autodown
+ @data.sort_by! {|x| x.length}
+ @data.uniq!
  end
  def download_file(path)
  arr = path.split('/')
  shortname = arr.pop
  puts "Initiating Download to: #{'/rr-downloads/' + shortname}"
  File.open(shortname, "wb") do |saved_file|
- # the following "open" is provided by open-uri
  open(path) do |read_file|
  saved_file.write(read_file.read)
  end
@@ -39,7 +33,7 @@ module Retriever
  puts " SUCCESS: Download Complete"
  end
  def autodownload()
- lenny = @fileStack.count
+ lenny = @data.count
  puts "###################"
  puts "### Initiating Autodownload..."
  puts "###################"
@@ -53,7 +47,7 @@ module Retriever
  Dir.chdir("rr-downloads")
  end
  file_counter = 0
- @fileStack.each do |entry|
+ @data.each do |entry|
  begin
  self.download_file(entry)
  file_counter+=1
data/lib/retriever/fetchseo.rb ADDED
@@ -0,0 +1,23 @@
+ module Retriever
+ class FetchSEO < Fetch
+ def initialize(url,options)
+ super
+ @data = []
+ page_one = Retriever::Page.new(@t.source,@t)
+ @linkStack = page_one.parseInternalVisitable
+ lg("URL Crawled: #{@t.target}")
+ lg("#{@linkStack.size-1} new links found")
+
+ @data.push(page_one.parseSEO)
+ lg("#{@data.size} pages scraped")
+ errlog("Bad URL -- #{@t.target}") if !@linkStack
+
+ @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
+ @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
+
+ self.async_crawl_and_collect()
+
+ @data.sort_by! {|x| x[0].length}
+ end
+ end
+ end
data/lib/retriever/fetchsitemap.rb CHANGED
@@ -1,38 +1,34 @@
  module Retriever
  class FetchSitemap < Fetch
- attr_reader :sitemap
  def initialize(url,options)
  super
- @sitemap = [@t.target]
- @linkStack = self.parseInternalVisitableLinks(self.fetchLinks(@t.source))
+ @data = [@t.target]
+ page_one = Retriever::Page.new(@t.source,@t)
+ @linkStack = page_one.parseInternalVisitable
  lg("URL Crawled: #{@t.target}")
- self.lg("#{@linkStack.size-1} new links found")
+ lg("#{@linkStack.size-1} new links found")
  errlog("Bad URL -- #{@t.target}") if !@linkStack
 
  @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
  @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
- @sitemap.concat(@linkStack)
+ @data.concat(@linkStack)
 
  self.async_crawl_and_collect()
 
- @sitemap.sort_by! {|x| x.length} if @sitemap.size>1
- @sitemap.uniq!
-
- self.dump(self.sitemap)
- self.write(self.sitemap) if /CSV/i =~ @s
- self.gen_xml(self.sitemap) if /XML/i =~ @s
+ @data.sort_by! {|x| x.length} if @data.size>1
+ @data.uniq!
  end
- def gen_xml(data)
+ def gen_xml
  f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
  f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
- data.each do |url|
+ @data.each do |url|
  f << "<url><loc>#{url}</loc></url>"
  end
  f << "</urlset>"
  f.close
  puts "###############################"
  puts "File Created: sitemap-#{@t.host.split('.')[1]}.xml"
- puts "Object Count: #{@sitemap.size}"
+ puts "Object Count: #{@data.size}"
  puts "###############################"
  puts
  end
data/lib/retriever/openuri-redirect-patch.rb ADDED
@@ -0,0 +1,6 @@
+ module OpenURI
+ def OpenURI.redirectable?(uri1, uri2) #nesc patch otherwise OPENURI blocks redirects to and from https
+ uri1.scheme.downcase == uri2.scheme.downcase ||
+ (/\A(?:http|https)\z/i =~ uri1.scheme && /\A(?:http|https)\z/i =~ uri2.scheme)
+ end
+ end
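Without this patch, the `open-uri` calls in `Target#source` and `FetchFiles#download_file` refuse redirects that switch between http and https. Loading the gem pulls the patch in through `lib/retriever.rb`, so cross-scheme redirects are followed. A minimal sketch (the URL is illustrative, standing in for any host that redirects its http:// address to https://):

```ruby
require 'retriever'  # lib/retriever.rb also requires the open-uri patch

# Kernel#open is handled by open-uri here, just as in Target#source.
# Before the patch an http -> https redirect raises; with it loaded,
# the redirect is followed like any other.
html = open('http://example.com/').read
puts html[0, 80]
```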
data/lib/retriever/page.rb ADDED
@@ -0,0 +1,68 @@
+ module Retriever
+
+ class Page
+
+ HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
+ NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico|xml)/ix).freeze
+ HTTP_RE = Regexp.new(/^http/i).freeze
+ DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+
+ TITLE_RE = Regexp.new(/<title>(.*)<\/title>/i).freeze
+ DESC_RE = Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\'][^>]*content=[\"]([^\"]*)[\"][^>]*>/i).freeze
+ H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
+ H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze
+
+ attr_reader :links, :source, :t
+
+ def initialize(source,t)
+ @t = t
+ @source = source.encode('UTF-8', :invalid => :replace, :undef => :replace)
+ @links = nil
+ end
+
+ #recieves page source as string
+ #returns array of unique href links
+ def links
+ return @links if @links
+ return false if !@source
+ @links = @source.scan(HREF_CONTENTS_RE).map do |match| #filter some malformed URLS that come in, this is meant to be a loose filter to catch all reasonable HREF attributes.
+ link = match[0]
+ Link.new(@t.host, link).path
+ end.uniq
+ end
+
+ def parseInternal
+ links.select{ |linky| (@t.host_re =~ linky) }
+ end
+
+ def parseInternalVisitable
+ parseInternal.select{ |linky| (!(NONPAGE_EXT_RE =~linky)) }
+ end
+
+ def parseFiles
+ links.select{ |linky| (@t.file_re =~ linky)}
+ end
+
+ def title
+ TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : ""
+ end
+
+ def desc
+ DESC_RE =~ @source ? @source.match(DESC_RE)[1] : ""
+ end
+
+ def h1
+ H1_RE =~ @source ? @source.match(H1_RE)[1] : ""
+ end
+
+ def h2
+ H2_RE =~ @source ? @source.match(H2_RE)[1] : ""
+ end
+
+ def parseSEO
+ return [title,desc,h1,h2]
+ end
+
+ end
+
+ end
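The new `Page` object takes over all of the parsing that previously lived in `Fetch`. A short sketch of using it directly, mirroring the way spec/page_spec.rb drives it (the HTML snippet is made up for illustration):

```ruby
require 'retriever'

target = Retriever::Target.new("http://www.cnet.com/reviews/", /\.exe\z/)

source = <<-HTML
<title>Reviews</title>
<meta name='description' content="Latest reviews">
<h1>Top picks</h1>
<a href='http://www.cnet.com/products/gadgets/'>gadgets</a>
<a href='www.cnet.com/download.exe'>download</a>
<a href='http://www.yahoo.com/'>yahoo</a>
HTML

page = Retriever::Page.new(source, target)
page.parseSEO               # => ["Reviews", "Latest reviews", "Top picks", ""]
page.links                  # every href the loose regex picks up
page.parseInternalVisitable # cnet.com links minus NONPAGE_EXT_RE matches
page.parseFiles             # links matching the target's file_re (.exe here)
```

This is also why `Fetch#initialize` now builds the `Target` with `@file_re`: `Page#parseFiles` reads the pattern back off the target instead of keeping it on the fetcher.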
data/lib/retriever/target.rb CHANGED
@@ -1,17 +1,22 @@
  require 'open-uri'
 
  module Retriever
+
  class Target
+
  HTTP_RE = Regexp.new(/^http/i).freeze
  DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
- attr_reader :host, :target, :host_re, :source
- def initialize(url)
+
+ attr_reader :host, :target, :host_re, :source, :file_re
+
+ def initialize(url,file_re=nil)
  url = "http://#{url}" if (!(HTTP_RE =~ url))
  fail "Bad URL" if (!(/\./ =~ url))
  new_uri = URI(url)
  @target = new_uri.to_s
  @host = new_uri.host
- @host_re = Regexp.new(@host).freeze
+ @host_re = Regexp.new(@host.sub('www.',''))
+ @file_re ||= file_re
  end
 
  def source
@@ -19,23 +24,29 @@ module Retriever
  begin
  resp = open(@target)
  rescue StandardError => e
- #puts e.message + " ## " + url
- #the trap abrt is nescessary to handle the SSL error
- #for some ungodly reason it's the only way I found to handle it
  trap("ABRT"){
  puts "#{@target} failed SSL Certification Verification"
  }
  return false
  end
- if (@target != resp.base_uri.to_s)
- fail "Domain redirecting to new host: #{resp.base_uri.to_s}" if (!(@host_re =~ resp.base_uri.to_s))
+ resp_url = resp.base_uri.to_s
+ if (@target != resp_url)
+ if @host_re =~ resp_url #if redirect URL is same hose, we want to re-sync our target with the right URL
+ new_t = Retriever::Target.new(resp_url)
+ @target = new_t.target
+ @host = new_t.host
+ return new_t.source
+ end
+ fail "Domain redirecting to new host: #{resp.base_uri.to_s}" #if it's not same host, we want to fail
  end
  resp = resp.read
  if resp == ""
  fail "Domain is not working. Try the non-WWW version."
  end
- return resp.encode('UTF-8', :invalid => :replace, :undef => :replace) #.force_encoding('UTF-8') #ran into issues with some sites without forcing UTF8 encoding, and also issues with it. Not sure atm.
+ fail "Domain not working. Try HTTPS???" if !resp
+ return resp.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace) #consider using scrub from ruby 2.1? this misses some things
  end
 
  end
+
  end
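`Target` now carries the file regex and a www-stripped host regex, and `source` re-syncs itself on same-host redirects instead of failing outright. A short sketch of constructing one directly, matching what spec/target_spec.rb asserts:

```ruby
require 'retriever'

# The second argument is the optional file regex that Fetch#initialize
# now passes through as @file_re.
t = Retriever::Target.new("www.cnet.com/reviews/", /\.exe\z/)

t.target   # => "http://www.cnet.com/reviews/"  (protocol added when missing)
t.host     # => "www.cnet.com"
t.host_re  # => /cnet.com/  (www. stripped, so www and bare host both match)
t.file_re  # => /\.exe\z/
t.source   # GETs the page via open-uri, re-syncing on same-host redirects
```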
data/lib/retriever/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Retriever
- VERSION = '0.1.4'
+ VERSION = '1.0.0'
  end
data/lib/retriever.rb CHANGED
@@ -1,8 +1,12 @@
  require 'retriever/fetch'
  require 'retriever/fetchfiles'
  require 'retriever/fetchsitemap'
+ require 'retriever/fetchseo'
+ require 'retriever/cli'
  require 'retriever/link'
  require 'retriever/target'
+ require 'retriever/page'
+ require 'retriever/openuri-redirect-patch'
 
  module Retriever
 
data/readme.md CHANGED
@@ -17,7 +17,7 @@ Install the gem
  ```sh
  gem install rubyretriever
  ```
-
+
  **Example: Sitemap mode**
  ```sh
  rr --sitemap CSV --progress --limit 100 http://www.cnet.com
@@ -31,14 +31,25 @@ This would go to http://www.cnet.com and map it until it crawled a max of 100 pa
 
  **Example: File Harvesting mode**
  ```sh
- rr --files pdf --progress --limit 1000 --output hubspot http://www.hubspot.com
+ rr --files pdf --progress --limit 1000 --out hubspot http://www.hubspot.com
+ ```
+ OR -- SAME COMMAND
+ ```sh
+ rr -f pdf -p -l 100 http://www.hubspot.com
+ ```
+
+ This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 100 pages, and then it would write out a list of filepaths to a csv named hubspot (based on the website host name. Optionally we could have the script then go and autodownload all the files by adding the -a/--auto flag -- however this current example would just dump to stdout a list of all the PDF's found.
+
+ **Example: SEO mode**
+ ```sh
+ rr --seo --progress --limit 100 --out cnet-seo http://www.cnet.com
  ```
  OR -- SAME COMMAND
  ```sh
- rr -f pdf -p -l 1000 http://www.hubspot.com
+ rr -e -p -l 10 -o cnet-seo http://www.cnet.com
  ```
 
- This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 1,000 pages, and then it would write out a list of filepaths to a csv named hubspot (based on the website host name. Optionally we could have the script then go and autodownload all the files by adding the -a/--auto flag -- however this current example would just dump to stdout a list of all the PDF's found.
+ This would go to http://www.cnet.com and crawl a max of 100 pages, during which it would be collecting the onpage SEO fields on those pages - currently this means [url, page title, meta description, h1 text, h2 text], and then it would write it out to a csv named cnet-seo.
 
 
  command-line arguments
@@ -47,10 +58,11 @@ Usage: rr [MODE FLAG] [OPTIONS] Target_URL
 
  Where MODE FLAG is required, and is either:
  -s, --sitemap FORMAT (only accepts CSV or XML atm)
- -f, --files FILETYPE
+ -f, --files FILETYPE
+ -e, --seo
 
  and OPTIONS is the applicable:
- -o, --out FILENAME *Dump output to selected filename --being phased out*
+ -o, --out FILENAME *Dump fetch data as CSV*
  -p, --progress *Outputs a progressbar*
  -v, --verbose *Output more information*
  -l, --limit PAGE_LIMIT_# *set a max on the total number of crawled pages*
data/spec/link_spec.rb CHANGED
@@ -2,8 +2,8 @@ require 'retriever'
 
  describe "Link" do
 
- r = Retriever::Fetch.new("http://www.cnet.com/reviews/", {})
- let(:links) { r.fetchLinks(@source) }
+ t = Retriever::Target.new("http://www.cnet.com/reviews/")
+ let(:links) { Retriever::Page.new(@source,t).links }
 
  it "collects links in anchor tags" do
  @source = (<<SOURCE).strip
data/spec/page_spec.rb ADDED
@@ -0,0 +1,94 @@
+ require 'retriever/page'
+ require 'retriever/fetch'
+
+ t = Retriever::Target.new("http://www.cnet.com/reviews/",/\.exe\z/)
+
+ describe "Page" do
+
+ describe "#links" do
+ let (:links){Retriever::Page.new(@source,t).links}
+ it "collects all unique href links on the page" do
+ @source = (<<SOURCE).strip
+ <a href='www.cnet.com/download.exe'>download</a>
+ <a href='/test.html'>test</a>
+ <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'></a>
+ <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
+ <a href='http://www.yahoo.com/test/'>yahoo</a>
+ SOURCE
+
+ expect(links).to have(4).items
+ end
+ end
+
+ describe "#parseInternal" do
+ let (:links){Retriever::Page.new(@source,t).parseInternal}
+ it "filters links by host" do
+ @source = (<<SOURCE).strip
+ <a href='http://www.cnet.com/'>download</a>
+ <a href='http://www.yahoo.com/test/'>yahoo</a>
+ SOURCE
+
+ expect(links).to have(1).items
+ end
+ end
+
+ describe "#parseInternalVisitable" do
+ let (:links){Retriever::Page.new(@source,t).parseInternalVisitable}
+ it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
+ @source = (<<SOURCE).strip
+ <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
+ SOURCE
+ expect(links).to have(0).items
+ end
+ end
+
+ describe "#parseFiles" do
+ let (:links){Retriever::Page.new(@source,t).parseFiles}
+ it "filters links by filetype" do
+ @source = (<<SOURCE).strip
+ <a href='www.cnet.com/download.exe'>download</a>
+ http://www.google.com
+ <a href='/test.html'>test</a>
+ SOURCE
+ expect(links).to have(1).items
+ end
+ end
+
+ describe "#title" do
+ let (:page){Retriever::Page.new(@source,t)}
+ it "returns page title" do
+ @source = (<<SOURCE).strip
+ <title>test</title>
+ SOURCE
+ expect(page.title).to eq('test')
+ end
+ end
+ describe "#desc" do
+ let (:page){Retriever::Page.new(@source,t)}
+ it "returns meta description" do
+ @source = (<<SOURCE).strip
+ <meta name='description' content="test2 ">
+ SOURCE
+ expect(page.desc).to eq('test2 ')
+ end
+ end
+ describe "#h1" do
+ let (:page){Retriever::Page.new(@source,t)}
+ it "returns h1 text" do
+ @source = (<<SOURCE).strip
+ <h1>test 3</h1>
+ SOURCE
+ expect(page.h1).to eq('test 3')
+ end
+ end
+ describe "#h2" do
+ let (:page){Retriever::Page.new(@source,t)}
+ it "returns h2 text" do
+ @source = (<<SOURCE).strip
+ <h2> test 4 </h2>
+ SOURCE
+ expect(page.h2).to eq(' test 4 ')
+ end
+ end
+
+ end
data/spec/retriever_spec.rb CHANGED
@@ -1,52 +1,5 @@
  require 'retriever'
 
- r = Retriever::Fetch.new("http://www.cnet.com/reviews/",{:file_ext => "exe",:maxpages => "100"})
- test_html = "<a href='www.cnet.com/download.exe'>download</a>
- http://www.google.com
- <a href='/test.html'>test</a>
- <a href='http://www.cnet.com/products/gadgets#view-comments'>gadgets comments</a>
- <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'></a>
- <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
- <a href='http://www.yahoo.com/test/'>yahoo</a>
- test.com
- <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
- <a href='cpage_18'>about</a>"
-
- links_collection = r.fetchLinks(test_html)
-
  describe "Fetch" do
-
- describe "#fetchLinks" do
- it "collects all unique href links on the page" do
- expect(links_collection).to have(6).items
- end
- end
-
- describe "#parseInternalLinks" do
- let (:filtered_links) {r.parseInternalLinks(links_collection)}
- it "filters links by host" do
- filtered_links.each do |link|
- expect(link).to include("www.cnet.com")
- end
- end
- end
-
- describe "#parseInternalVisitableLinks" do
- let (:filtered_links) {r.parseInternalVisitableLinks(links_collection)}
- it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
- filtered_links.each do |link|
- expect(link).to_not (include(".css",".js",".png",".gif",".jpg"))
- end
- end
- end
-
- describe "#parseFiles" do
- let(:file_list) {r.parseFiles(links_collection)}
- it "filters links by filetype" do
- file_list.each do |link|
- expect(link).to include(".exe")
- end
- end
- end
-
+
  end
data/spec/target_spec.rb CHANGED
@@ -1,7 +1,7 @@
  require 'retriever'
  require 'open-uri'
 
- t = Retriever::Target.new("http://www.cnet.com/reviews/")
+ t = Retriever::Target.new("http://www.cnet.com/reviews/",/\.exe\z/)
 
  describe "Target" do
 
@@ -14,7 +14,11 @@ describe "Target" do
  end
 
  it "creates host_re var" do
- expect(t.host_re).to eq(/www.cnet.com/)
+ expect(t.host_re).to eq(/cnet.com/)
+ end
+
+ it "creates file_re var (when provided)" do
+ expect(t.file_re).to eq(/\.exe\z/)
  end
 
  it "adds protocol to Target URL if none given" do
@@ -34,6 +38,7 @@ describe "Target" do
  it "fails if target redirects to new host" do
  expect{Retriever::Target.new("http://tinyurl.com/nkfkypa").source}.to raise_error
  end
+
  end
 
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: rubyretriever
  version: !ruby/object:Gem::Version
- version: 0.1.4
+ version: 1.0.0
  platform: ruby
  authors:
  - Joe Norton
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2014-05-26 00:00:00.000000000 Z
+ date: 2014-06-07 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: em-synchrony
@@ -119,18 +119,23 @@ files:
  - LICENSE
  - bin/rr
  - lib/retriever.rb
+ - lib/retriever/cli.rb
  - lib/retriever/fetch.rb
  - lib/retriever/fetchfiles.rb
+ - lib/retriever/fetchseo.rb
  - lib/retriever/fetchsitemap.rb
  - lib/retriever/link.rb
+ - lib/retriever/openuri-redirect-patch.rb
+ - lib/retriever/page.rb
  - lib/retriever/target.rb
  - lib/retriever/version.rb
  - readme.md
  - spec/link_spec.rb
+ - spec/page_spec.rb
  - spec/retriever_spec.rb
  - spec/spec_helper.rb
  - spec/target_spec.rb
- homepage: http://www.softwarebyjoe.com/rubyretriever/
+ homepage: http://softwarebyjoe.com/rubyretriever/
  licenses:
  - MIT
  metadata: {}