rubyretriever 0.1.4 → 1.0.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 924c9958e88587353cc80f4b134cca91f73f3e57
- data.tar.gz: 393457cd37ad3fb372008a7829c8028f658f2b58
+ metadata.gz: 80eba5c4fdf8c33a19ca35ba37b4493cf44ab57a
+ data.tar.gz: 4dbca842c4f56060e13cfe1c0acf0256321df573
  SHA512:
- metadata.gz: 5dcde12eb9fea2181b6a91c0d798351b78efa80652547afd02db536ab9d139de2969b08326d61363435baf04cc4036a0ed0a4cbdd9c884bd05314b8210c38938
- data.tar.gz: 8c226a13d4e0b29beffc1940b6ca05ff9f4ae403decc1990a2a6418f90fca12e132852c48eea082918d74cb593006e98ed14ffbe9366d9dd64ef0f058eefd7a2
+ metadata.gz: 48181c41247d85b16db74eb8b7c0a74c23c9740d00d2fd79ecfdb8435efa64d81fb34ccbc81e32911d7fe0e6942c6f7c7c9f91d39feeb434ab078f659ada1341
+ data.tar.gz: 3dace96b1bd42fa2292e8a9db3506983d688e7877b12ca30e550a81be9bad6a7de9907eb2f2b75c6263146a8721d08437dfabe06a493de057dc8a764b57c3a39
data/bin/rr CHANGED
@@ -1,18 +1,24 @@
  #! /usr/bin/env ruby
+
  require 'retriever'
  require 'optparse'
+
  options = {}
  optparse = OptionParser.new do|opts|
  # Set a banner, displayed at the top
  # of the help screen.
  opts.banner = "Usage: rr [MODE FLAG] [options] Target_URL"
  options[:sitemap] = false
- opts.on( '-s', '--sitemap FORMAT', 'MODE FLAG: Sitemap mode - Crawl site and output sitemap, format choices: CSV or XML' ) do |output_type|
- options[:sitemap] = output_type
+ opts.on( '-s', '--sitemap [FORMAT]', 'MODE FLAG: Sitemap mode' ) do |output_type|
+ options[:sitemap] = output_type||''
  end
  options[:fileharvest] = false
- opts.on( '-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode - Crawl site and collect links for files found, extension for filetype' ) do |file_ext|
+ opts.on( '-f', '--files FILETYPE', 'MODE FLAG: Fileharvest mode' ) do |file_ext|
  options[:fileharvest] = file_ext
+ end
+ options[:seo] = false
+ opts.on( '-e', '--seo', 'MODE FLAG: SEO mode' ) do
+ options[:seo] = true
  end
  options[:filename] = nil
  opts.on( '-o', '--out FILENAME', 'Dump output to selected filename' ) do|filename|
@@ -56,14 +62,14 @@ ARGV.each do|q|
  puts "### Outputting in format: #{options[:sitemap]}" if options[:sitemap]
  puts "### Performing File Harvest" if options[:fileharvest]
  puts "### Searching for file extension: #{options[:fileharvest]} pages" if (options[:fileharvest])
+ puts "### Performing SEO Scrape" if options[:seo]
  puts "### Writting output to filename: #{options[:filename]}" if options[:filename]
  puts "### Being verbose"
- puts "### Stopping after #{options[:maxpages]} pages" if options[:maxpages]
+ puts "### Stopping after #{options[:maxpages]} pages"
  end
  puts "###############################"
  puts "### [RubyRetriever] go fetch #{q}"
- Retriever::FetchFiles.new(q, options) if options[:fileharvest]
- Retriever::FetchSitemap.new(q, options) if options[:sitemap]
+ Retriever::CLI.new(q, options)
  puts "### [RubyRetriever] is done."
  puts "###############################"
  puts
data/lib/retriever/cli.rb ADDED
@@ -0,0 +1,27 @@
+ module Retriever
+ class CLI
+ def initialize(url,options)
+
+ #kick off the fetch mode of choice
+ if options[:fileharvest]
+ @fetch = Retriever::FetchFiles.new(url, options)
+ elsif options[:sitemap]
+ @fetch = Retriever::FetchSitemap.new(url, options)
+ elsif options[:seo]
+ @fetch = Retriever::FetchSEO.new(url, options)
+ else
+ fail "### Error: No Mode Selected"
+ end
+
+ #all fetch modes
+ @fetch.dump
+ @fetch.write if options[:filename]
+
+ #fileharvest only
+ @fetch.autodownload if options[:autodown] && options[:fileharvest]
+
+ #sitemap only
+ @fetch.gen_xml if /XML/i =~ options[:sitemap].to_s
+ end
+ end
+ end
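Editor's note, not part of the diff: a minimal sketch of driving the new `Retriever::CLI` entry point directly from Ruby, using the same option keys `bin/rr` assembles above; the URL and option values are illustrative only.

```ruby
require 'retriever'

# Option keys mirror what bin/rr builds from its flags (-s/-f/-e, -o, -l).
options = {
  :sitemap  => 'xml',     # mode flag; a value matching /XML/i also triggers gen_xml
  :filename => 'example', # -o/--out: CLI calls @fetch.write when this is set
  :maxpages => 10         # -l/--limit
}

# Dispatches to FetchSitemap here; FetchFiles or FetchSEO for the other modes.
Retriever::CLI.new('http://www.example.com', options)
```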
data/lib/retriever/fetch.rb CHANGED
@@ -9,12 +9,14 @@ require 'bloomfilter-rb'
  module Retriever
  class Fetch
  attr_reader :maxPages, :t
- #constants
- HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
- NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico)/ix).freeze

  def initialize(url,options)
- @t = Retriever::Target.new(url)
+ @connection_tally = {
+ :success => 0,
+ :error => 0,
+ :error_client => 0,
+ :error_server => 0
+ }
  #OPTIONS
  @prgrss = options[:progress] ? options[:progress] : false
  @maxPages = options[:maxpages] ? options[:maxpages].to_i : 100
@@ -23,6 +25,7 @@ module Retriever
  @fh = options[:fileharvest] ? options[:fileharvest] : false
  @file_ext = @fh.to_s
  @s = options[:sitemap] ? options[:sitemap] : false
+ @seo = options[:seo] ? true : false
  @autodown = options[:autodown] ? true : false
  #
  if @fh
@@ -30,9 +33,6 @@ module Retriever
  @file_re = Regexp.new(tempExtStr).freeze
  else
  errlog("Cannot AUTODOWNLOAD when not in FILEHARVEST MODE") if @autodown #when FH is not true, and autodown is true
- if !@output
- @output = "rr-#{@t.host.split('.')[1]}"
- end
  end
  if @prgrss
  errlog("CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME, CHOOSE ONE, -v or -p") if @v #verbose & progressbar conflict
@@ -44,8 +44,13 @@ module Retriever
  }
  @progressbar = ProgressBar.create(prgressVars)
  end
+ @t = Retriever::Target.new(url,@file_re)
  @already_crawled = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
  @already_crawled.insert(@t.target)
+ if (@fh && !@output)
+ @output = "rr-#{@t.host.split('.')[1]}"
+ end
+ fail "bad page source on target -- try HTTPS?" if !@t.source
  end
  def errlog(msg)
  raise "ERROR: #{msg}"
@@ -53,52 +58,52 @@ module Retriever
  def lg(msg)
  puts "### #{msg}" if @v
  end
- def dump(data)
+ def dump
  puts "###############################"
+ if @v
+ puts "Connection Tally:"
+ puts @connection_tally.to_s
+ puts "###############################"
+ end
  if @s
  puts "#{@t.target} Sitemap"
- puts "Page Count: #{data.size}"
+ puts "Page Count: #{@data.size}"
  elsif @fh
  puts "Target URL: #{@t.target}"
  puts "Filetype: #{@file_ext}"
- puts "File Count: #{data.size}"
+ puts "File Count: #{@data.size}"
+ elsif @seo
+ puts "#{@t.target} SEO Metrics"
+ puts "Page Count: #{@data.size}"
  else
- puts "ERROR"
+ fail "ERROR - Cannot dump - Mode Not Found"
  end
  puts "###############################"
- puts data
+ @data.each do |line|
+ puts line
+ end
  puts "###############################"
  puts
  end
- def write(data)
+ def write
  if @output
+ i = 0
  CSV.open("#{@output}.csv", "w") do |csv|
- data.each do |entry|
- csv << [entry]
- end
+ if ((i == 0) && @seo)
+ csv << ['URL','Page Title','Meta Description','H1','H2']
+ i +=1
+ end
+ @data.each do |entry|
+ csv << entry
+ end
  end
  puts "###############################"
  puts "File Created: #{@output}.csv"
- puts "Object Count: #{data.size}"
+ puts "Object Count: #{@data.size}"
  puts "###############################"
  puts
  end
  end
- #recieves page source as string
- #returns array of unique href links
- def fetchLinks(doc)
- return false if !doc
- doc.scan(HREF_CONTENTS_RE).map do |match| #filter some malformed URLS that come in, this is meant to be a loose filter to catch all reasonable HREF attributes.
- link = match[0]
- Link.new(@t.host, link).path
- end.uniq
- end
- def parseInternalLinks(all_links)
- all_links.select{ |linky| (@t.host_re =~ linky) }
- end
- def parseInternalVisitableLinks(all_links)
- parseInternalLinks(all_links).select{ |linky| (!(NONPAGE_EXT_RE =~linky)) }
- end
  def async_crawl_and_collect()
  while (@already_crawled.size < @maxPages)
  if @linkStack.empty?
@@ -112,11 +117,41 @@ module Retriever
  new_links_arr = self.asyncGetWave()
  next if (new_links_arr.nil? || new_links_arr.empty?)
  new_link_arr = new_links_arr-@linkStack#set operations to see are these in our previous visited pages arr?
- @linkStack.concat(new_links_arr)
- @sitemap.concat(new_links_arr) if @s
+ @linkStack.concat(new_links_arr).uniq!
+ @data.concat(new_links_arr) if @s
  end
  @progressbar.finish if @prgrss
  end
+ def good_response?(resp, url) #returns true is resp is ok to continue process, false is we need to 'next' it
+ return false if !resp
+ if resp.response_header.redirection? #we got redirected
+ loc = resp.response_header.location
+ lg("#{url} Redirected to #{loc}")
+ if t.host_re =~ loc #if being redirected to same host, let's add to linkstack
+ @linkStack.push(loc) if !@already_crawled.include?(loc) #but only if we haven't already crawled it
+ lg("--Added to linkStack for later")
+ return false
+ end
+ lg("Redirection outside of target host. No - go. #{loc}")
+ return false
+ end
+ if (!resp.response_header.successful?) #if webpage is not text/html, let's not continue and lets also make sure we dont re-queue it
+ lg("UNSUCCESSFUL CONNECTION -- #{url}")
+ @connection_tally[:error] += 1
+ @connection_tally[:error_server] += 1 if resp.response_header.server_error?
+ @connection_tally[:error_client] += 1 if resp.response_header.client_error?
+ return false
+ end
+ if (!(resp.response_header['CONTENT_TYPE'].include?("text/html"))) #if webpage is not text/html, let's not continue and lets also make sure we dont re-queue it
+ @already_crawled.insert(url)
+ @linkStack.delete(url)
+ lg("Page Not text/html -- #{url}")
+ return false
+ end
+ @connection_tally[:success] += 1
+ return true
+ end
+
  def asyncGetWave() #send a new wave of GET requests, using current @linkStack
  new_stuff = []
  EM.synchrony do
@@ -129,20 +164,27 @@ module Retriever
  next
  end
  resp = EventMachine::HttpRequest.new(url).get
- lg("URL Crawled: #{url}")
+ next if !good_response?(resp,url)
+ new_page = Retriever::Page.new(resp.response,@t)
+ lg("Page Fetched: #{url}")
  @already_crawled.insert(url)
  if @prgrss
  @progressbar.increment if @already_crawled.size < @maxPages
  end
- new_links_arr = self.fetchLinks(resp.response)
- if new_links_arr
- lg("#{new_links_arr.size} new links found")
- internal_links_arr = self.parseInternalLinks(new_links_arr)
+ if @seo
+ seos = [url]
+ seos.concat(new_page.parseSEO)
+ @data.push(seos)
+ lg("--page SEO scraped")
+ end
+ if new_page.links
+ lg("--#{new_page.links.size} links found")
+ internal_links_arr = new_page.parseInternalVisitable
  new_stuff.push(internal_links_arr)
  if @fh
- filez = self.parseFiles(new_links_arr)
- @fileStack.concat(filez) if !filez.empty?
- lg("#{filez.size} files found")
+ filez = new_page.parseFiles
+ @data.concat(filez) if !filez.empty?
+ lg("--#{filez.size} files found")
  end
  end
  end
@@ -151,8 +193,5 @@ module Retriever
  end
  new_stuff.uniq!
  end
- def parseFiles(all_links)
- all_links.select{ |linky| (@file_re =~ linky)}
- end
  end
  end
data/lib/retriever/fetchfiles.rb CHANGED
@@ -1,17 +1,16 @@
  module Retriever
  class FetchFiles < Fetch
- attr_reader :fileStack
  def initialize(url,options)
  super
- @fileStack = []
- all_links = self.fetchLinks(@t.source)
- @linkStack = self.parseInternalVisitableLinks(all_links)
+ @data = []
+ page_one = Retriever::Page.new(@t.source,@t)
+ @linkStack = page_one.parseInternalVisitable
  lg("URL Crawled: #{@t.target}")
- self.lg("#{@linkStack.size-1} new links found")
+ lg("#{@linkStack.size-1} new links found")

- tempFileCollection = self.parseFiles(all_links)
- @fileStack.concat(tempFileCollection) if tempFileCollection.size>0
- self.lg("#{@fileStack.size} new files found")
+ tempFileCollection = page_one.parseFiles
+ @data.concat(tempFileCollection) if tempFileCollection.size>0
+ lg("#{@data.size} new files found")
  errlog("Bad URL -- #{@t.target}") if !@linkStack

  @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
@@ -19,19 +18,14 @@ module Retriever

  self.async_crawl_and_collect()

- @fileStack.sort_by! {|x| x.length}
- @fileStack.uniq!
-
- self.dump(self.fileStack)
- self.write(@output,self.fileStack) if @output
- self.autodownload() if @autodown
+ @data.sort_by! {|x| x.length}
+ @data.uniq!
  end
  def download_file(path)
  arr = path.split('/')
  shortname = arr.pop
  puts "Initiating Download to: #{'/rr-downloads/' + shortname}"
  File.open(shortname, "wb") do |saved_file|
- # the following "open" is provided by open-uri
  open(path) do |read_file|
  saved_file.write(read_file.read)
  end
@@ -39,7 +33,7 @@ module Retriever
  puts " SUCCESS: Download Complete"
  end
  def autodownload()
- lenny = @fileStack.count
+ lenny = @data.count
  puts "###################"
  puts "### Initiating Autodownload..."
  puts "###################"
@@ -53,7 +47,7 @@ module Retriever
  Dir.chdir("rr-downloads")
  end
  file_counter = 0
- @fileStack.each do |entry|
+ @data.each do |entry|
  begin
  self.download_file(entry)
  file_counter+=1
data/lib/retriever/fetchseo.rb ADDED
@@ -0,0 +1,23 @@
+ module Retriever
+ class FetchSEO < Fetch
+ def initialize(url,options)
+ super
+ @data = []
+ page_one = Retriever::Page.new(@t.source,@t)
+ @linkStack = page_one.parseInternalVisitable
+ lg("URL Crawled: #{@t.target}")
+ lg("#{@linkStack.size-1} new links found")
+
+ @data.push(page_one.parseSEO)
+ lg("#{@data.size} pages scraped")
+ errlog("Bad URL -- #{@t.target}") if !@linkStack
+
+ @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
+ @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
+
+ self.async_crawl_and_collect()
+
+ @data.sort_by! {|x| x[0].length}
+ end
+ end
+ end
data/lib/retriever/fetchsitemap.rb CHANGED
@@ -1,38 +1,34 @@
  module Retriever
  class FetchSitemap < Fetch
- attr_reader :sitemap
  def initialize(url,options)
  super
- @sitemap = [@t.target]
- @linkStack = self.parseInternalVisitableLinks(self.fetchLinks(@t.source))
+ @data = [@t.target]
+ page_one = Retriever::Page.new(@t.source,@t)
+ @linkStack = page_one.parseInternalVisitable
  lg("URL Crawled: #{@t.target}")
- self.lg("#{@linkStack.size-1} new links found")
+ lg("#{@linkStack.size-1} new links found")
  errlog("Bad URL -- #{@t.target}") if !@linkStack

  @linkStack.delete(@t.target) if @linkStack.include?(@t.target)
  @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages)
- @sitemap.concat(@linkStack)
+ @data.concat(@linkStack)

  self.async_crawl_and_collect()

- @sitemap.sort_by! {|x| x.length} if @sitemap.size>1
- @sitemap.uniq!
-
- self.dump(self.sitemap)
- self.write(self.sitemap) if /CSV/i =~ @s
- self.gen_xml(self.sitemap) if /XML/i =~ @s
+ @data.sort_by! {|x| x.length} if @data.size>1
+ @data.uniq!
  end
- def gen_xml(data)
+ def gen_xml
  f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+')
  f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
- data.each do |url|
+ @data.each do |url|
  f << "<url><loc>#{url}</loc></url>"
  end
  f << "</urlset>"
  f.close
  puts "###############################"
  puts "File Created: sitemap-#{@t.host.split('.')[1]}.xml"
- puts "Object Count: #{@sitemap.size}"
+ puts "Object Count: #{@data.size}"
  puts "###############################"
  puts
  end
data/lib/retriever/openuri-redirect-patch.rb ADDED
@@ -0,0 +1,6 @@
+ module OpenURI
+ def OpenURI.redirectable?(uri1, uri2) #nesc patch otherwise OPENURI blocks redirects to and from https
+ uri1.scheme.downcase == uri2.scheme.downcase ||
+ (/\A(?:http|https)\z/i =~ uri1.scheme && /\A(?:http|https)\z/i =~ uri2.scheme)
+ end
+ end
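Editor's note, not part of the diff: the point of this patch is that stock open-uri in Ruby of this era refused to follow redirects that hop between http and https; the override above allows them. A small sketch of the behavior it changes (example.com is illustrative):

```ruby
require 'uri'
require 'retriever' # loads retriever/openuri-redirect-patch along with the gem

# With the patch applied, a redirect crossing http <-> https is considered OK:
OpenURI.redirectable?(URI('http://example.com/'), URI('https://example.com/'))
# => true (the unpatched method returns false for this scheme change)
```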
data/lib/retriever/page.rb ADDED
@@ -0,0 +1,68 @@
+ module Retriever
+
+ class Page
+
+ HREF_CONTENTS_RE = Regexp.new(/\shref=['|"]([^\s][a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+)['|"][\s|\W]/ix).freeze
+ NONPAGE_EXT_RE = Regexp.new(/\.(?:css|js|png|gif|jpg|mp4|wmv|flv|mp3|wav|doc|txt|ico|xml)/ix).freeze
+ HTTP_RE = Regexp.new(/^http/i).freeze
+ DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
+
+ TITLE_RE = Regexp.new(/<title>(.*)<\/title>/i).freeze
+ DESC_RE = Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\'][^>]*content=[\"]([^\"]*)[\"][^>]*>/i).freeze
+ H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
+ H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze
+
+ attr_reader :links, :source, :t
+
+ def initialize(source,t)
+ @t = t
+ @source = source.encode('UTF-8', :invalid => :replace, :undef => :replace)
+ @links = nil
+ end
+
+ #recieves page source as string
+ #returns array of unique href links
+ def links
+ return @links if @links
+ return false if !@source
+ @links = @source.scan(HREF_CONTENTS_RE).map do |match| #filter some malformed URLS that come in, this is meant to be a loose filter to catch all reasonable HREF attributes.
+ link = match[0]
+ Link.new(@t.host, link).path
+ end.uniq
+ end
+
+ def parseInternal
+ links.select{ |linky| (@t.host_re =~ linky) }
+ end
+
+ def parseInternalVisitable
+ parseInternal.select{ |linky| (!(NONPAGE_EXT_RE =~linky)) }
+ end
+
+ def parseFiles
+ links.select{ |linky| (@t.file_re =~ linky)}
+ end
+
+ def title
+ TITLE_RE =~ @source ? @source.match(TITLE_RE)[1] : ""
+ end
+
+ def desc
+ DESC_RE =~ @source ? @source.match(DESC_RE)[1] : ""
+ end
+
+ def h1
+ H1_RE =~ @source ? @source.match(H1_RE)[1] : ""
+ end
+
+ def h2
+ H2_RE =~ @source ? @source.match(H2_RE)[1] : ""
+ end
+
+ def parseSEO
+ return [title,desc,h1,h2]
+ end
+
+ end
+
+ end
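Editor's note, not part of the diff: a short sketch of the new `Retriever::Page` parser used on its own, with a Target built the same way the specs below build theirs; the HTML string and the `.pdf` file filter are made up for illustration.

```ruby
require 'retriever'

t    = Retriever::Target.new('http://www.example.com/', /\.pdf\z/)
html = "<title>Example</title><h1>Hello</h1>" \
       "<a href='http://www.example.com/docs/guide.pdf'>guide</a> " \
       "<a href='http://www.example.com/about/'>about</a> "

page = Retriever::Page.new(html, t)
page.links                  # every unique href matched by HREF_CONTENTS_RE
page.parseInternalVisitable # internal links minus NONPAGE_EXT_RE matches
page.parseFiles             # links matching the target's file_re (.pdf here)
page.parseSEO               # => [title, meta description, h1, h2]
```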
data/lib/retriever/target.rb CHANGED
@@ -1,17 +1,22 @@
  require 'open-uri'

  module Retriever
+
  class Target
+
  HTTP_RE = Regexp.new(/^http/i).freeze
  DUB_DUB_DUB_DOT_RE = Regexp.new(/^www\./i).freeze
- attr_reader :host, :target, :host_re, :source
- def initialize(url)
+
+ attr_reader :host, :target, :host_re, :source, :file_re
+
+ def initialize(url,file_re=nil)
  url = "http://#{url}" if (!(HTTP_RE =~ url))
  fail "Bad URL" if (!(/\./ =~ url))
  new_uri = URI(url)
  @target = new_uri.to_s
  @host = new_uri.host
- @host_re = Regexp.new(@host).freeze
+ @host_re = Regexp.new(@host.sub('www.',''))
+ @file_re ||= file_re
  end

  def source
@@ -19,23 +24,29 @@ module Retriever
  begin
  resp = open(@target)
  rescue StandardError => e
- #puts e.message + " ## " + url
- #the trap abrt is nescessary to handle the SSL error
- #for some ungodly reason it's the only way I found to handle it
  trap("ABRT"){
  puts "#{@target} failed SSL Certification Verification"
  }
  return false
  end
- if (@target != resp.base_uri.to_s)
- fail "Domain redirecting to new host: #{resp.base_uri.to_s}" if (!(@host_re =~ resp.base_uri.to_s))
+ resp_url = resp.base_uri.to_s
+ if (@target != resp_url)
+ if @host_re =~ resp_url #if redirect URL is same hose, we want to re-sync our target with the right URL
+ new_t = Retriever::Target.new(resp_url)
+ @target = new_t.target
+ @host = new_t.host
+ return new_t.source
+ end
+ fail "Domain redirecting to new host: #{resp.base_uri.to_s}" #if it's not same host, we want to fail
  end
  resp = resp.read
  if resp == ""
  fail "Domain is not working. Try the non-WWW version."
  end
- return resp.encode('UTF-8', :invalid => :replace, :undef => :replace) #.force_encoding('UTF-8') #ran into issues with some sites without forcing UTF8 encoding, and also issues with it. Not sure atm.
+ fail "Domain not working. Try HTTPS???" if !resp
+ return resp.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace) #consider using scrub from ruby 2.1? this misses some things
  end

  end
+
  end
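Editor's note, not part of the diff: the reworked `Target#source` now re-syncs `@target`/`@host` on a same-host redirect and only fails on a cross-host one. A brief sketch of the accessors, mirroring the values asserted in target_spec.rb below:

```ruby
require 'retriever'

t = Retriever::Target.new('www.cnet.com/reviews/', /\.exe\z/)

t.target  # => "http://www.cnet.com/reviews/"  (protocol prepended when missing)
t.host    # => "www.cnet.com"
t.host_re # => /cnet.com/  (leading www. stripped so both variants match)
t.file_re # => /\.exe\z/

# t.source performs the HTTP fetch: a same-host redirect updates the target,
# a redirect to a different host raises via fail.
```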
data/lib/retriever/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Retriever
- VERSION = '0.1.4'
+ VERSION = '1.0.0'
  end
data/lib/retriever.rb CHANGED
@@ -1,8 +1,12 @@
  require 'retriever/fetch'
  require 'retriever/fetchfiles'
  require 'retriever/fetchsitemap'
+ require 'retriever/fetchseo'
+ require 'retriever/cli'
  require 'retriever/link'
  require 'retriever/target'
+ require 'retriever/page'
+ require 'retriever/openuri-redirect-patch'

  module Retriever

data/readme.md CHANGED
@@ -17,7 +17,7 @@ Install the gem
  ```sh
  gem install rubyretriever
  ```
-
+
  **Example: Sitemap mode**
  ```sh
  rr --sitemap CSV --progress --limit 100 http://www.cnet.com
@@ -31,14 +31,25 @@ This would go to http://www.cnet.com and map it until it crawled a max of 100 pa

  **Example: File Harvesting mode**
  ```sh
- rr --files pdf --progress --limit 1000 --output hubspot http://www.hubspot.com
+ rr --files pdf --progress --limit 1000 --out hubspot http://www.hubspot.com
+ ```
+ OR -- SAME COMMAND
+ ```sh
+ rr -f pdf -p -l 100 http://www.hubspot.com
+ ```
+
+ This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 100 pages, and then it would write out a list of filepaths to a csv named hubspot (based on the website host name. Optionally we could have the script then go and autodownload all the files by adding the -a/--auto flag -- however this current example would just dump to stdout a list of all the PDF's found.
+
+ **Example: SEO mode**
+ ```sh
+ rr --seo --progress --limit 100 --out cnet-seo http://www.cnet.com
  ```
  OR -- SAME COMMAND
  ```sh
- rr -f pdf -p -l 1000 http://www.hubspot.com
+ rr -e -p -l 10 -o cnet-seo http://www.cnet.com
  ```

- This would go to http://www.hubspot.com and crawl it looking for filetype:PDF until it crawled a max of 1,000 pages, and then it would write out a list of filepaths to a csv named hubspot (based on the website host name. Optionally we could have the script then go and autodownload all the files by adding the -a/--auto flag -- however this current example would just dump to stdout a list of all the PDF's found.
+ This would go to http://www.cnet.com and crawl a max of 100 pages, during which it would be collecting the onpage SEO fields on those pages - currently this means [url, page title, meta description, h1 text, h2 text], and then it would write it out to a csv named cnet-seo.


  command-line arguments
@@ -47,10 +58,11 @@ Usage: rr [MODE FLAG] [OPTIONS] Target_URL

  Where MODE FLAG is required, and is either:
  -s, --sitemap FORMAT (only accepts CSV or XML atm)
- -f, --files FILETYPE
+ -f, --files FILETYPE
+ -e, --seo

  and OPTIONS is the applicable:
- -o, --out FILENAME *Dump output to selected filename --being phased out*
+ -o, --out FILENAME *Dump fetch data as CSV*
  -p, --progress *Outputs a progressbar*
  -v, --verbose *Output more information*
  -l, --limit PAGE_LIMIT_# *set a max on the total number of crawled pages*
data/spec/link_spec.rb CHANGED
@@ -2,8 +2,8 @@ require 'retriever'

  describe "Link" do

- r = Retriever::Fetch.new("http://www.cnet.com/reviews/", {})
- let(:links) { r.fetchLinks(@source) }
+ t = Retriever::Target.new("http://www.cnet.com/reviews/")
+ let(:links) { Retriever::Page.new(@source,t).links }

  it "collects links in anchor tags" do
  @source = (<<SOURCE).strip
data/spec/page_spec.rb ADDED
@@ -0,0 +1,94 @@
+ require 'retriever/page'
+ require 'retriever/fetch'
+
+ t = Retriever::Target.new("http://www.cnet.com/reviews/",/\.exe\z/)
+
+ describe "Page" do
+
+ describe "#links" do
+ let (:links){Retriever::Page.new(@source,t).links}
+ it "collects all unique href links on the page" do
+ @source = (<<SOURCE).strip
+ <a href='www.cnet.com/download.exe'>download</a>
+ <a href='/test.html'>test</a>
+ <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'></a>
+ <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
+ <a href='http://www.yahoo.com/test/'>yahoo</a>
+ SOURCE
+
+ expect(links).to have(4).items
+ end
+ end
+
+ describe "#parseInternal" do
+ let (:links){Retriever::Page.new(@source,t).parseInternal}
+ it "filters links by host" do
+ @source = (<<SOURCE).strip
+ <a href='http://www.cnet.com/'>download</a>
+ <a href='http://www.yahoo.com/test/'>yahoo</a>
+ SOURCE
+
+ expect(links).to have(1).items
+ end
+ end
+
+ describe "#parseInternalVisitable" do
+ let (:links){Retriever::Page.new(@source,t).parseInternalVisitable}
+ it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
+ @source = (<<SOURCE).strip
+ <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
+ SOURCE
+ expect(links).to have(0).items
+ end
+ end
+
+ describe "#parseFiles" do
+ let (:links){Retriever::Page.new(@source,t).parseFiles}
+ it "filters links by filetype" do
+ @source = (<<SOURCE).strip
+ <a href='www.cnet.com/download.exe'>download</a>
+ http://www.google.com
+ <a href='/test.html'>test</a>
+ SOURCE
+ expect(links).to have(1).items
+ end
+ end
+
+ describe "#title" do
+ let (:page){Retriever::Page.new(@source,t)}
+ it "returns page title" do
+ @source = (<<SOURCE).strip
+ <title>test</title>
+ SOURCE
+ expect(page.title).to eq('test')
+ end
+ end
+ describe "#desc" do
+ let (:page){Retriever::Page.new(@source,t)}
+ it "returns meta description" do
+ @source = (<<SOURCE).strip
+ <meta name='description' content="test2 ">
+ SOURCE
+ expect(page.desc).to eq('test2 ')
+ end
+ end
+ describe "#h1" do
+ let (:page){Retriever::Page.new(@source,t)}
+ it "returns h1 text" do
+ @source = (<<SOURCE).strip
+ <h1>test 3</h1>
+ SOURCE
+ expect(page.h1).to eq('test 3')
+ end
+ end
+ describe "#h2" do
+ let (:page){Retriever::Page.new(@source,t)}
+ it "returns h2 text" do
+ @source = (<<SOURCE).strip
+ <h2> test 4 </h2>
+ SOURCE
+ expect(page.h2).to eq(' test 4 ')
+ end
+ end
+
+ end
data/spec/retriever_spec.rb CHANGED
@@ -1,52 +1,5 @@
  require 'retriever'

- r = Retriever::Fetch.new("http://www.cnet.com/reviews/",{:file_ext => "exe",:maxpages => "100"})
- test_html = "<a href='www.cnet.com/download.exe'>download</a>
- http://www.google.com
- <a href='/test.html'>test</a>
- <a href='http://www.cnet.com/products/gadgets#view-comments'>gadgets comments</a>
- <a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'></a>
- <a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
- <a href='http://www.yahoo.com/test/'>yahoo</a>
- test.com
- <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
- <a href='cpage_18'>about</a>"
-
- links_collection = r.fetchLinks(test_html)
-
  describe "Fetch" do
-
- describe "#fetchLinks" do
- it "collects all unique href links on the page" do
- expect(links_collection).to have(6).items
- end
- end
-
- describe "#parseInternalLinks" do
- let (:filtered_links) {r.parseInternalLinks(links_collection)}
- it "filters links by host" do
- filtered_links.each do |link|
- expect(link).to include("www.cnet.com")
- end
- end
- end
-
- describe "#parseInternalVisitableLinks" do
- let (:filtered_links) {r.parseInternalVisitableLinks(links_collection)}
- it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
- filtered_links.each do |link|
- expect(link).to_not (include(".css",".js",".png",".gif",".jpg"))
- end
- end
- end
-
- describe "#parseFiles" do
- let(:file_list) {r.parseFiles(links_collection)}
- it "filters links by filetype" do
- file_list.each do |link|
- expect(link).to include(".exe")
- end
- end
- end
-
+
  end
data/spec/target_spec.rb CHANGED
@@ -1,7 +1,7 @@
  require 'retriever'
  require 'open-uri'

- t = Retriever::Target.new("http://www.cnet.com/reviews/")
+ t = Retriever::Target.new("http://www.cnet.com/reviews/",/\.exe\z/)

  describe "Target" do

@@ -14,7 +14,11 @@ describe "Target" do
  end

  it "creates host_re var" do
- expect(t.host_re).to eq(/www.cnet.com/)
+ expect(t.host_re).to eq(/cnet.com/)
+ end
+
+ it "creates file_re var (when provided)" do
+ expect(t.file_re).to eq(/\.exe\z/)
  end

  it "adds protocol to Target URL if none given" do
@@ -34,6 +38,7 @@ describe "Target" do
  it "fails if target redirects to new host" do
  expect{Retriever::Target.new("http://tinyurl.com/nkfkypa").source}.to raise_error
  end
+
  end

  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: rubyretriever
  version: !ruby/object:Gem::Version
- version: 0.1.4
+ version: 1.0.0
  platform: ruby
  authors:
  - Joe Norton
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2014-05-26 00:00:00.000000000 Z
+ date: 2014-06-07 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: em-synchrony
@@ -119,18 +119,23 @@ files:
  - LICENSE
  - bin/rr
  - lib/retriever.rb
+ - lib/retriever/cli.rb
  - lib/retriever/fetch.rb
  - lib/retriever/fetchfiles.rb
+ - lib/retriever/fetchseo.rb
  - lib/retriever/fetchsitemap.rb
  - lib/retriever/link.rb
+ - lib/retriever/openuri-redirect-patch.rb
+ - lib/retriever/page.rb
  - lib/retriever/target.rb
  - lib/retriever/version.rb
  - readme.md
  - spec/link_spec.rb
+ - spec/page_spec.rb
  - spec/retriever_spec.rb
  - spec/spec_helper.rb
  - spec/target_spec.rb
- homepage: http://www.softwarebyjoe.com/rubyretriever/
+ homepage: http://softwarebyjoe.com/rubyretriever/
  licenses:
  - MIT
  metadata: {}