powerdlz23 1.2.3 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Spider/README.md +19 -0
- package/Spider/domain.py +18 -0
- package/Spider/general.py +51 -0
- package/Spider/link_finder.py +25 -0
- package/Spider/main.py +50 -0
- package/Spider/spider.py +74 -0
- package/crawler/.formatter.exs +5 -0
- package/crawler/.github/workflows/ci.yml +29 -0
- package/crawler/.recode.exs +33 -0
- package/crawler/.tool-versions +2 -0
- package/crawler/CHANGELOG.md +82 -0
- package/crawler/README.md +198 -0
- package/crawler/architecture.svg +4 -0
- package/crawler/config/config.exs +9 -0
- package/crawler/config/dev.exs +5 -0
- package/crawler/config/test.exs +5 -0
- package/crawler/examples/google_search/scraper.ex +37 -0
- package/crawler/examples/google_search/url_filter.ex +11 -0
- package/crawler/examples/google_search.ex +77 -0
- package/crawler/lib/crawler/dispatcher/worker.ex +14 -0
- package/crawler/lib/crawler/dispatcher.ex +20 -0
- package/crawler/lib/crawler/fetcher/header_preparer.ex +60 -0
- package/crawler/lib/crawler/fetcher/modifier.ex +45 -0
- package/crawler/lib/crawler/fetcher/policer.ex +77 -0
- package/crawler/lib/crawler/fetcher/recorder.ex +55 -0
- package/crawler/lib/crawler/fetcher/requester.ex +32 -0
- package/crawler/lib/crawler/fetcher/retrier.ex +43 -0
- package/crawler/lib/crawler/fetcher/url_filter.ex +26 -0
- package/crawler/lib/crawler/fetcher.ex +81 -0
- package/crawler/lib/crawler/http.ex +7 -0
- package/crawler/lib/crawler/linker/path_builder.ex +71 -0
- package/crawler/lib/crawler/linker/path_expander.ex +59 -0
- package/crawler/lib/crawler/linker/path_finder.ex +106 -0
- package/crawler/lib/crawler/linker/path_offliner.ex +59 -0
- package/crawler/lib/crawler/linker/path_prefixer.ex +46 -0
- package/crawler/lib/crawler/linker.ex +173 -0
- package/crawler/lib/crawler/options.ex +127 -0
- package/crawler/lib/crawler/parser/css_parser.ex +37 -0
- package/crawler/lib/crawler/parser/guarder.ex +38 -0
- package/crawler/lib/crawler/parser/html_parser.ex +41 -0
- package/crawler/lib/crawler/parser/link_parser/link_expander.ex +32 -0
- package/crawler/lib/crawler/parser/link_parser.ex +50 -0
- package/crawler/lib/crawler/parser.ex +122 -0
- package/crawler/lib/crawler/queue_handler.ex +45 -0
- package/crawler/lib/crawler/scraper.ex +28 -0
- package/crawler/lib/crawler/snapper/dir_maker.ex +45 -0
- package/crawler/lib/crawler/snapper/link_replacer.ex +95 -0
- package/crawler/lib/crawler/snapper.ex +82 -0
- package/crawler/lib/crawler/store/counter.ex +19 -0
- package/crawler/lib/crawler/store/page.ex +7 -0
- package/crawler/lib/crawler/store.ex +87 -0
- package/crawler/lib/crawler/worker.ex +62 -0
- package/crawler/lib/crawler.ex +91 -0
- package/crawler/mix.exs +78 -0
- package/crawler/mix.lock +40 -0
- package/crawler/test/fixtures/introducing-elixir.jpg +0 -0
- package/crawler/test/integration_test.exs +135 -0
- package/crawler/test/lib/crawler/dispatcher/worker_test.exs +7 -0
- package/crawler/test/lib/crawler/dispatcher_test.exs +5 -0
- package/crawler/test/lib/crawler/fetcher/header_preparer_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher/policer_test.exs +71 -0
- package/crawler/test/lib/crawler/fetcher/recorder_test.exs +9 -0
- package/crawler/test/lib/crawler/fetcher/requester_test.exs +9 -0
- package/crawler/test/lib/crawler/fetcher/retrier_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher/url_filter_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher_test.exs +153 -0
- package/crawler/test/lib/crawler/http_test.exs +47 -0
- package/crawler/test/lib/crawler/linker/path_builder_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_expander_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_finder_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_offliner_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_prefixer_test.exs +7 -0
- package/crawler/test/lib/crawler/linker_test.exs +7 -0
- package/crawler/test/lib/crawler/options_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/css_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/guarder_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/html_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/link_parser/link_expander_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/link_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser_test.exs +8 -0
- package/crawler/test/lib/crawler/queue_handler_test.exs +7 -0
- package/crawler/test/lib/crawler/scraper_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper/dir_maker_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper/link_replacer_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper_test.exs +9 -0
- package/crawler/test/lib/crawler/worker_test.exs +5 -0
- package/crawler/test/lib/crawler_test.exs +295 -0
- package/crawler/test/support/test_case.ex +24 -0
- package/crawler/test/support/test_helpers.ex +28 -0
- package/crawler/test/test_helper.exs +7 -0
- package/package.json +1 -1
- package/rubyretriever/.rspec +2 -0
- package/rubyretriever/.travis.yml +7 -0
- package/rubyretriever/Gemfile +3 -0
- package/rubyretriever/Gemfile.lock +64 -0
- package/rubyretriever/LICENSE +20 -0
- package/rubyretriever/Rakefile +7 -0
- package/rubyretriever/bin/rr +79 -0
- package/rubyretriever/lib/retriever/cli.rb +25 -0
- package/rubyretriever/lib/retriever/core_ext.rb +13 -0
- package/rubyretriever/lib/retriever/fetch.rb +268 -0
- package/rubyretriever/lib/retriever/fetchfiles.rb +71 -0
- package/rubyretriever/lib/retriever/fetchseo.rb +18 -0
- package/rubyretriever/lib/retriever/fetchsitemap.rb +43 -0
- package/rubyretriever/lib/retriever/link.rb +47 -0
- package/rubyretriever/lib/retriever/openuri_redirect_patch.rb +8 -0
- package/rubyretriever/lib/retriever/page.rb +104 -0
- package/rubyretriever/lib/retriever/page_iterator.rb +21 -0
- package/rubyretriever/lib/retriever/target.rb +47 -0
- package/rubyretriever/lib/retriever/version.rb +4 -0
- package/rubyretriever/lib/retriever.rb +15 -0
- package/rubyretriever/readme.md +166 -0
- package/rubyretriever/rubyretriever.gemspec +41 -0
- package/rubyretriever/spec/link_spec.rb +77 -0
- package/rubyretriever/spec/page_spec.rb +94 -0
- package/rubyretriever/spec/retriever_spec.rb +84 -0
- package/rubyretriever/spec/spec_helper.rb +17 -0
- package/rubyretriever/spec/target_spec.rb +55 -0
package/rubyretriever/lib/retriever/fetch.rb
@@ -0,0 +1,268 @@
+require 'em-synchrony'
+require 'em-synchrony/em-http'
+require 'em-synchrony/fiber_iterator'
+require 'ruby-progressbar'
+require 'open-uri'
+require 'csv'
+require 'bloomfilter-rb'
+
+module Retriever
+  #
+  class Fetch
+    HR = '###############################'
+    attr_reader :max_pages, :t, :result
+    # given target URL and RR options, creates a fetch object.
+    # There is no direct output
+    # this is a parent class that the other fetch classes build off of.
+    def initialize(url, options)
+      @iterator = false
+      @result = []
+      @connection_tally = {
+        success: 0,
+        error: 0,
+        error_client: 0,
+        error_server: 0
+      }
+      setup_options(options)
+      setup_progress_bar if @progress
+      @t = Retriever::Target.new(url, @file_re)
+      @output = "rr-#{@t.host.split('.')[1]}" if @fileharvest && !@output
+      @already_crawled = setup_bloom_filter
+    end
+
+    def start
+      @page_one = crawl_page_one
+      @link_stack = create_link_stack
+      @temp_link_stack = []
+    end
+
+    def errlog(msg)
+      fail "ERROR: #{msg}"
+    end
+
+    def lg(msg)
+      puts "### #{msg}" if @verbose
+    end
+
+    # prints current data collection to STDOUT
+    def dump
+      puts HR
+      puts "Connection Tally:\n#{@connection_tally}\n#{HR}" if @verbose
+      puts "Target URL: #{@t.target}"
+      if @sitemap
+        puts 'Sitemap'
+      elsif @fileharvest
+        puts "File harvest by type: #{@fileharvest}"
+      elsif @seo
+        puts 'SEO Metrics'
+      end
+      puts "Data Dump -- Object Count: #{@result.size}"
+      puts HR
+      @result.each do |line|
+        puts line
+      end
+      puts
+    end
+
+    # writes current data collection out to CSV in current directory
+    def write
+      return false unless @output
+      i = 0
+      CSV.open("#{@output}.csv", 'w') do |csv|
+        if (i == 0) && @seo
+          csv << ['URL', 'Page Title', 'Meta Description', 'H1', 'H2']
+          i += 1
+        end
+        @result.each do |entry|
+          csv << entry
+        end
+      end
+      puts HR
+      puts "File Created: #{@output}.csv"
+      puts "Object Count: #{@result.size}"
+      puts HR
+      puts
+    end
+
+    # returns true is resp is ok to continue
+    def good_response?(resp, url)
+      return false unless resp
+      hdr = resp.response_header
+      if hdr.redirection?
+        loc = hdr.location
+        lg("#{url} Redirected to #{loc}")
+        if t.host_re =~ loc
+          @temp_link_stack.push(loc) unless @already_crawled.include?(loc)
+          lg('--Added to stack for later')
+          return false
+        end
+        lg("Redirection outside of target host. No - go. #{loc}")
+        return false
+      end
+      # lets not continue if unsuccessful connection
+      unless hdr.successful?
+        lg("UNSUCCESSFUL CONNECTION -- #{url}")
+        @connection_tally[:error] += 1
+        @connection_tally[:error_server] += 1 if hdr.server_error?
+        @connection_tally[:error_client] += 1 if hdr.client_error?
+        return false
+      end
+      # let's not continue if not text/html
+      unless hdr['CONTENT_TYPE'] =~ %r{(text/html|application/xhtml+xml)}
+        @already_crawled.insert(url)
+        lg("Page Not text/html -- #{url}")
+        return false
+      end
+      @connection_tally[:success] += 1
+      true
+    end
+
+    def filter_out_querystrings(path)
+      if path.include?('?')
+        uri = Addressable::URI.parse(path)
+        uri.query_values = {}
+        return uri.to_s.chomp('?')
+      end
+      path
+    end
+
+    private
+
+    def setup_options(options)
+      @progress = options['progress']
+      @max_pages = options['maxpages'] ? options['maxpages'].to_i : 100
+      @verbose = options['verbose']
+      @output = options['filename']
+      @fileharvest = options['fileharvest']
+      @sitemap = options['sitemap']
+      @seo = options['seo']
+      @autodown = options['autodown']
+      @file_re = Regexp.new(/.#{@fileharvest}/).freeze if @fileharvest
+    end
+
+    def setup_bloom_filter
+      already_crawled = BloomFilter::Native.new(
+        size: 1_000_000,
+        hashes: 5,
+        seed: 1,
+        bucket: 8,
+        raise: false
+      )
+      already_crawled.insert(@t.target)
+      already_crawled
+    end
+
+    def setup_progress_bar
+      # verbose & progressbar conflict
+      errlog('CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME') if @verbose
+      prgress_vars = {
+        title: 'Pages',
+        starting_at: 1,
+        total: @max_pages,
+        format: '%a |%b>%i| %c/%C %t'
+      }
+      @progressbar = ProgressBar.create(prgress_vars)
+    end
+
+    def crawl_page_one
+      page_one = Retriever::Page.new(@t.target, @t.source, @t)
+      lg("URL Crawled: #{@t.target}")
+      page_one
+    end
+
+    def create_link_stack
+      link_stack = @page_one.parse_internal_visitable
+      errlog("Bad URL -- #{@t.target}") unless link_stack
+      lg("#{link_stack.size - 1} links found")
+      link_stack.delete(@t.target)
+      link_stack.take(@max_pages) if (link_stack.size + 1) > @max_pages
+      link_stack
+    end
+
+    def end_crawl_notice
+      notice = "#{HR}\nENDING CRAWL\nCan't find any more links."
+      @progressbar.log(notice) if @progress
+      lg(notice)
+    end
+
+    # iterates over the existing @link_stack
+    # running until we reach the @max_pages value.
+    def async_crawl_and_collect(&block)
+      while @already_crawled.size < @max_pages
+        if @link_stack.empty?
+          end_crawl_notice
+          break
+        end
+        new_links_arr = process_link_stack(&block)
+        @temp_link_stack = []
+        next if new_links_arr.nil? || new_links_arr.empty?
+        @link_stack.concat(new_links_arr)
+        next unless @sitemap
+        @result.concat(new_links_arr)
+      end
+      @result.uniq!
+    end
+
+    def push_seo_to_result(url, new_page)
+      seos = [url]
+      seos.concat(new_page.parse_seo)
+      @result.push(seos)
+      lg('--page SEO scraped')
+    end
+
+    def push_files_to_result(new_page)
+      filez = new_page.parse_files(new_page.parse_internal)
+      @result.concat(filez) unless filez.empty?
+      lg("--#{filez.size} files found")
+    end
+
+    def page_from_response(url, response)
+      lg("Page Fetched: #{url}")
+      @already_crawled.insert(url)
+      if @progress && (@already_crawled.size < @max_pages)
+        @progressbar.increment
+      end
+      Retriever::Page.new(url, response, @t)
+    end
+
+    def new_visitable_links(current_page)
+      lg("--#{current_page.links.size} links found")
+      current_page.parse_internal_visitable
+    end
+
+    def push_custom_to_result(url, current_page, &block)
+      data = block.call current_page
+      @result.push(data) unless data.empty?
+      lg("-- PageIterator called on: #{url}")
+    end
+
+    # send a new wave of GET requests, using current @link_stack
+    # at end of the loop it empties link_stack
+    # puts new links into temporary stack
+    def process_link_stack(&block)
+      EM.synchrony do
+        concurrency = 10
+        EM::Synchrony::FiberIterator.new(@link_stack, concurrency).each do |url|
+          next if @already_crawled.size >= @max_pages
+          next if @already_crawled.include?(url)
+          resp = EventMachine::HttpRequest.new(url).get
+          next unless good_response?(resp, url)
+          current_page = page_from_response(url, resp.response)
+          # non-link dependent modes
+          push_seo_to_result(url, current_page) if @seo
+          push_custom_to_result(url, current_page, &block) if @iterator
+          next unless current_page.links.size > 0
+          @temp_link_stack.push(new_visitable_links(current_page))
+          # link dependent modes
+          next unless @fileharvest
+          push_files_to_result(current_page)
+        end
+        EventMachine.stop
+      end
+      # empty the stack. most clean way
+      @link_stack = []
+      # temp contains redirects + new visitable links
+      @temp_link_stack.flatten.uniq!
+    end
+  end
+end
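As a usage sketch (not part of the diff): the string keys read by `Fetch#setup_options` above are the gem's option names; the values below are hypothetical, and any of the concrete fetchers later in this diff could be constructed with such a hash.

```ruby
# Hypothetical values; the string keys mirror those read in Fetch#setup_options.
options = {
  'progress'    => false,   # progress bar (conflicts with 'verbose', see setup_progress_bar)
  'maxpages'    => 50,      # crawl cap; defaults to 100 when omitted
  'verbose'     => true,    # per-page logging via Fetch#lg
  'filename'    => 'out',   # CSV basename used by Fetch#write
  'fileharvest' => 'pdf',   # file extension targeted by file-harvest crawls
  'sitemap'     => false,
  'seo'         => false,
  'autodown'    => false
}
```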
package/rubyretriever/lib/retriever/fetchfiles.rb
@@ -0,0 +1,71 @@
+module Retriever
+  # receives target url and RR options
+  # returns an array of all unique files (based on given filetype)
+  # found on the target site
+
+  class FetchFiles < Fetch
+    def initialize(url, options)
+      super
+      start
+      temp_file_collection = @page_one.parse_files(@page_one.parse_internal)
+      @result.concat(temp_file_collection) if temp_file_collection.size > 0
+      lg("#{@result.size} new files found")
+
+      async_crawl_and_collect
+      # done, make sure progress bar says we are done
+      @progressbar.finish if @progress
+      @result.sort_by! { |x| x.length }
+    end
+
+    def download_file(path)
+      path = filter_out_querystrings(path)
+      # given valid url, downloads file to current directory in /rr-downloads/
+      arr = path.split('/')
+      shortname = arr.pop
+      puts "Initiating Download of: #{shortname}"
+      File.open(shortname, 'wb') do |saved_file|
+        open(path) do |read_file|
+          saved_file.write(read_file.read)
+        end
+      end
+      puts ' SUCCESS: Download Complete'
+    end
+
+    def autodownload
+      # go through the fetched file URL collection and download each one.
+      puts HR
+      puts '### Initiating Autodownload...'
+      puts HR
+      puts "#{@result.count} - #{@file_ext}'s Located"
+      puts HR
+      move_to_download_dir
+      iterate_thru_collection_and_download
+      Dir.chdir('..')
+    end
+
+    private
+
+    def iterate_thru_collection_and_download
+      lenn = @result.count
+      @result.each_with_index do |entry, i|
+        begin
+          download_file(entry)
+        rescue StandardError
+          puts "ERROR: failed to download - #{entry}"
+        end
+        lg(" File [#{i + 1} of #{lenn}]\n")
+      end
+    end
+
+    def move_to_download_dir(dir_name = 'rr-downloads')
+      if File.directory?(dir_name)
+        Dir.chdir(dir_name)
+      else
+        puts "creating #{dir_name} Directory"
+        Dir.mkdir(dir_name)
+        Dir.chdir(dir_name)
+      end
+      puts "Downloading files to local directory: '/#{dir_name}/'"
+    end
+  end
+end
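A hedged usage sketch for the class above; the domain and option values are placeholders.

```ruby
require 'retriever'

# Hypothetical crawl: harvest PDF links from a site, then save them locally.
files = Retriever::FetchFiles.new('www.example.com', 'fileharvest' => 'pdf',
                                                     'maxpages'    => 25)
files.dump          # list the file URLs found (inherited from Fetch)
files.autodownload  # download each file into ./rr-downloads/
```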
package/rubyretriever/lib/retriever/fetchseo.rb
@@ -0,0 +1,18 @@
+module Retriever
+  #
+  class FetchSEO < Fetch
+    # receives target url and RR options
+    # returns an array of onpage SEO related fields
+    # on all unique pages found on the site
+    def initialize(url, options)
+      super
+      start
+      @result.push(@page_one.parse_seo)
+
+      async_crawl_and_collect
+      # done, make sure progress bar says we are done
+      @progressbar.finish if @progress
+      @result.sort_by! { |x| x[0].length }
+    end
+  end
+end
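A hedged usage sketch for the SEO fetcher above; the domain and option values are placeholders.

```ruby
require 'retriever'

# Hypothetical crawl: collect on-page SEO fields across the site.
seo = Retriever::FetchSEO.new('www.example.com', 'seo'      => true,
                                                 'filename' => 'seo-report')
seo.dump   # print the collected rows (page title, meta description, h1, h2)
seo.write  # seo-report.csv with the URL/Page Title/Meta Description/H1/H2 header
```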
package/rubyretriever/lib/retriever/fetchsitemap.rb
@@ -0,0 +1,43 @@
+module Retriever
+  #
+  class FetchSitemap < Fetch
+    # receives target URL and RR options
+    # returns an array of all unique pages found on the site
+    def initialize(url, options)
+      super
+      start
+      @result.push(@t.target)
+      @result.concat(@link_stack)
+
+      async_crawl_and_collect
+      # done, make sure progress bar says we are done
+      @progressbar.finish if @progress
+      @result.sort_by! { |x| x.length } if @result.size > 1
+      @result.uniq!
+    end
+
+    # produces valid XML sitemap based on page collection fetched.
+    # Writes to current directory.
+    def gen_xml
+      filename = @t.host.split('.')[1]
+      f = File.open("sitemap-#{filename}.xml", 'w+')
+      f << "<?xml version='1.0' encoding='UTF-8'?>"
+      f << "<urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
+      @result.each do |url|
+        f << "<url><loc>#{url}</loc></url>"
+      end
+      f << '</urlset>'
+      f.close
+      print_file_info(filename)
+    end
+
+    private
+
+    def print_file_info(filename)
+      puts HR
+      puts "File Created: sitemap-#{filename}.xml"
+      puts "Object Count: #{@result.size}"
+      puts HR + "\n"
+    end
+  end
+end
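A hedged usage sketch for the sitemap fetcher above; the domain and option values are placeholders.

```ruby
require 'retriever'

# Hypothetical crawl: collect every unique internal page and emit an XML sitemap.
sitemap = Retriever::FetchSitemap.new('www.example.com', 'sitemap'  => true,
                                                         'maxpages' => 100)
sitemap.gen_xml  # writes sitemap-<host segment>.xml to the current directory
sitemap.dump     # or print the collected URLs to STDOUT
```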
package/rubyretriever/lib/retriever/link.rb
@@ -0,0 +1,47 @@
+require 'addressable/uri'
+module Retriever
+  #
+  class Link
+    # HTTP_RE = Regexp.new(/^http/i).freeze
+    SLASH_RE = Regexp.new(%r(^/{1}[^/])).freeze
+    DOUBLE_SLASH_RE = Regexp.new(%r(^/{2}[^/])).freeze
+    WWW_DOT_RE = Regexp.new(/^www\./i).freeze
+
+    def initialize(target_scheme, target_host, this_link, current_url)
+      begin
+        #this_link = Addressable::URI.encode(this_link) //not necessary; and breaking links
+        @link_uri = Addressable::URI.parse(this_link)
+      rescue Addressable::URI::InvalidURIError
+        dummy = Retriever::Link.new(target_scheme, target_host, target_host, target_host)
+        @link_uri = Addressable::URI.parse(dummy.path)
+      end
+      @scheme = target_scheme
+      @host = target_host
+      @this_link = @link_uri.to_s
+      @current_page_url = current_url
+    end
+
+    def path
+      return this_link if link_uri.absolute?
+
+      return "#{@scheme}://#{this_link}" if WWW_DOT_RE =~ this_link
+
+      return "#{@scheme}://#{host}#{this_link}" if SLASH_RE =~ this_link
+
+      # link begins with '//'
+      return "#{@scheme}:#{this_link}" if DOUBLE_SLASH_RE =~ this_link
+
+      # link uses relative path with no slashes at all
+      if link_uri.relative?
+        if @current_page_url[-1, 1] == "/"
+          return "#{@current_page_url}#{this_link}"
+        end
+        return "#{@current_page_url}/#{this_link}"
+      end
+    end
+
+    private
+
+    attr_reader :this_link, :host, :link_uri, :current_page_url
+  end
+end
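For clarity, a sketch with hypothetical inputs illustrating each branch of `Link#path` above (hosts and paths are placeholders).

```ruby
# Hypothetical inputs; one call per branch of Link#path.
Retriever::Link.new('http', 'example.com', 'http://other.org/a', 'http://example.com/').path
# => "http://other.org/a"              absolute link, returned as-is
Retriever::Link.new('http', 'example.com', '/about', 'http://example.com/').path
# => "http://example.com/about"        single leading slash: scheme + host + path
Retriever::Link.new('http', 'example.com', '//cdn.example.com/x.js', 'http://example.com/').path
# => "http://cdn.example.com/x.js"     protocol-relative '//': scheme + link
Retriever::Link.new('http', 'example.com', 'contact', 'http://example.com/team').path
# => "http://example.com/team/contact" bare relative path appended to the current page URL
```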
package/rubyretriever/lib/retriever/openuri_redirect_patch.rb
@@ -0,0 +1,8 @@
+#
+module OpenURI
+  # nesc patch otherwise OPENURI blocks redirects to and from https
+  def self.redirectable?(uri1, uri2)
+    uri1.scheme.downcase == uri2.scheme.downcase ||
+      (/\A(?:http|https)\z/i =~ uri1.scheme && /\A(?:http|https)\z/i =~ uri2.scheme)
+  end
+end
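The patch above relaxes open-uri's same-scheme redirect check so HTTP/HTTPS redirects are followed in either direction. A quick sketch of the behaviour it enables, using stdlib `URI` objects:

```ruby
require 'uri'
require 'retriever/openuri_redirect_patch'

# With the patch applied, an HTTPS -> HTTP redirect counts as redirectable.
OpenURI.redirectable?(URI.parse('https://example.com'), URI.parse('http://example.com'))
# => true (unpatched open-uri refuses this direction)
```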
package/rubyretriever/lib/retriever/page.rb
@@ -0,0 +1,104 @@
+require 'nokogiri'
+require 'addressable/uri'
+#
+using SourceString
+module Retriever
+  #
+  class Page
+    HASH_RE = Regexp.new(/^#/i).freeze
+    HTTP_RE = Regexp.new(/^http/i).freeze
+    H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
+    H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze
+    TITLE_RE = Regexp.new(/<title>(.*)<\/title>/i).freeze
+    DESC_RE = Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\']
+                          [^>]*content=[\"]
+                          (
+                          [^\"]*
+                          )
+                          [\"]
+                          [^>]
+                          *>
+                          /ix).freeze
+    HREF_CONTENTS_RE = Regexp.new(/\shref=
+                                   ['|"]
+                                   (
+                                   [^\s]
+                                   [a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+
+                                   )
+                                   ['|"]
+                                   [\s|\W]
+                                   /ix).freeze
+    NONPAGE_EXT_RE = Regexp.new(/\.
+                                 (?:css|js|png|gif|jpg|mp4|
+                                 wmv|flv|mp3|wav|doc|txt|ico|xml)
+                                 /ix).freeze
+
+    attr_reader :links, :source, :t, :url
+
+    def initialize(url, source, t)
+      @url = url
+      @t = t
+      @source = source.encode_utf8_and_replace
+      @links = nil
+    end
+
+    # receives page source as string
+    # returns array of unique href links
+    def links
+      return @links if @links
+      return false unless @source
+      @links = @source.scan(HREF_CONTENTS_RE).map do |match|
+        # filter some malformed URLS that come in
+        # meant to be a loose filter to catch all reasonable HREF attributes.
+        link = match[0]
+        next if HASH_RE =~ link
+        Link.new(@t.scheme, host_with_port, link, @url).path
+      end.compact.uniq
+    end
+
+    def host_with_port
+      return @t.host if @t.port.nil?
+
+      @t.host + ':' + @t.port.to_s
+    end
+
+    def parse_internal
+      links.select do |x|
+        @t.host == Addressable::URI.parse(Addressable::URI.encode(x)).host
+      end
+    end
+
+    def parse_internal_visitable
+      parse_internal.select { |x| !(NONPAGE_EXT_RE =~ x) }
+    end
+
+    def parse_files(arr = parse_internal)
+      arr.select { |x| @t.file_re =~ x }
+    end
+
+    def parse_by_css(selector)
+      nokogiri_doc = Nokogiri::HTML(@source)
+      nokogiri_doc.css(selector).text
+    end
+
+    def title
+      TITLE_RE =~ @source ? @source.match(TITLE_RE)[1].decode_html : ''
+    end
+
+    def desc
+      DESC_RE =~ @source ? @source.match(DESC_RE)[1].decode_html : ''
+    end
+
+    def h1
+      H1_RE =~ @source ? @source.match(H1_RE)[1].decode_html : ''
+    end
+
+    def h2
+      H2_RE =~ @source ? @source.match(H2_RE)[1].decode_html : ''
+    end
+
+    def parse_seo
+      [title, desc, h1, h2]
+    end
+  end
+end
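A hedged usage sketch for the parser above, mirroring how `Fetch#crawl_page_one` builds a `Page`; `Target` (defined later in this diff) performs the actual HTTP fetch, and the domain is a placeholder.

```ruby
require 'retriever'

t    = Retriever::Target.new('www.example.com')
page = Retriever::Page.new(t.target, t.source, t)   # t.source returns the page body

page.title                     # <title> text
page.parse_seo                 # [title, desc, h1, h2]
page.parse_internal_visitable  # same-host links that look like crawlable pages
page.parse_by_css('h1')        # arbitrary extraction via Nokogiri
```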
package/rubyretriever/lib/retriever/page_iterator.rb
@@ -0,0 +1,21 @@
+module Retriever
+  #
+  class PageIterator < Fetch
+    # receives target url and RR options, and a block
+    # runs the block on all pages during crawl, pushing
+    # the returned value of the block onto a result stack
+    # the complete data returned from the crawl is accessible thru self.result
+    def initialize(url, options, &block)
+      super
+      start
+      fail 'block required for PageIterator' unless block_given?
+      @iterator = true
+      @result.push(block.call @page_one)
+      lg("-- PageIterator crawled- #{url}")
+      async_crawl_and_collect(&block)
+      # done, make sure progress bar says we are done
+      @progressbar.finish if @progress
+      @result.sort_by! { |x| x.length } if @result.size > 1
+    end
+  end
+end
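A hedged usage sketch for the iterator above; the domain, option values, and block body are placeholders.

```ruby
require 'retriever'

# Hypothetical crawl: the block runs on every crawled page and its return
# values accumulate in #result.
titles = Retriever::PageIterator.new('www.example.com', 'maxpages' => 10) do |page|
  [page.url, page.title]
end
titles.result  # => [[url, title], [url, title], ...]
```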
package/rubyretriever/lib/retriever/target.rb
@@ -0,0 +1,47 @@
+require 'open-uri'
+require 'addressable/uri'
+
+module Retriever
+  #
+  class Target
+    HTTP_RE = Regexp.new(/^http/i).freeze
+
+    attr_reader :host, :target, :host_re, :source, :file_re, :scheme, :port
+
+    def initialize(url, file_re = nil)
+      fail 'Bad URL' unless url.include?('.')
+      url = "http://#{url}" unless HTTP_RE =~ url
+      target_uri = Addressable::URI.parse(Addressable::URI.encode(url))
+      @target = target_uri.to_s
+      @host = target_uri.host
+      @host_re = Regexp.new(@host.sub('www.', ''))
+      @file_re ||= file_re
+      @scheme = target_uri.scheme
+      @port = target_uri.port
+    end
+
+    def source
+      resp = open(@target)
+      resp_url = resp.base_uri.to_s
+      if @target != resp_url
+        fail "Domain redirecting: #{resp_url}" unless @host_re =~ resp_url
+        # if redirect URL is same host, we want to re-sync @target
+        return resync_target_and_return_source(resp_url)
+      end
+      resp = resp.read
+      #
+      fail 'Domain is not working. Try the non-WWW version.' if resp == ''
+      fail 'Domain not working. Try HTTPS???' unless resp
+      # consider using scrub from ruby 2.1? this misses some things
+      resp.encode('UTF-8', 'binary', invalid: :replace, undef: :replace)
+    end
+
+    def resync_target_and_return_source(url)
+      new_t = Retriever::Target.new(url)
+      @target = new_t.target
+      @host = new_t.host
+      @scheme = new_t.scheme
+      new_t.source
+    end
+  end
+end
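A hedged usage sketch for the target wrapper above; the domain and file regexp are placeholders.

```ruby
require 'retriever'

t = Retriever::Target.new('example.com', /\.pdf/i)
t.target  # => "http://example.com"  (scheme added automatically when missing)
t.host    # => "example.com"
t.scheme  # => "http"
t.source  # fetches the page body via open-uri, re-syncing on same-host redirects
```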
package/rubyretriever/lib/retriever.rb
@@ -0,0 +1,15 @@
+require 'retriever/core_ext'
+require 'retriever/fetch'
+require 'retriever/fetchfiles'
+require 'retriever/fetchsitemap'
+require 'retriever/fetchseo'
+require 'retriever/page_iterator'
+require 'retriever/cli'
+require 'retriever/link'
+require 'retriever/target'
+require 'retriever/page'
+require 'retriever/openuri_redirect_patch'
+
+#
+module Retriever
+end