powerdlz23 1.2.3 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. package/Spider/README.md +19 -0
  2. package/Spider/domain.py +18 -0
  3. package/Spider/general.py +51 -0
  4. package/Spider/link_finder.py +25 -0
  5. package/Spider/main.py +50 -0
  6. package/Spider/spider.py +74 -0
  7. package/crawler/.formatter.exs +5 -0
  8. package/crawler/.github/workflows/ci.yml +29 -0
  9. package/crawler/.recode.exs +33 -0
  10. package/crawler/.tool-versions +2 -0
  11. package/crawler/CHANGELOG.md +82 -0
  12. package/crawler/README.md +198 -0
  13. package/crawler/architecture.svg +4 -0
  14. package/crawler/config/config.exs +9 -0
  15. package/crawler/config/dev.exs +5 -0
  16. package/crawler/config/test.exs +5 -0
  17. package/crawler/examples/google_search/scraper.ex +37 -0
  18. package/crawler/examples/google_search/url_filter.ex +11 -0
  19. package/crawler/examples/google_search.ex +77 -0
  20. package/crawler/lib/crawler/dispatcher/worker.ex +14 -0
  21. package/crawler/lib/crawler/dispatcher.ex +20 -0
  22. package/crawler/lib/crawler/fetcher/header_preparer.ex +60 -0
  23. package/crawler/lib/crawler/fetcher/modifier.ex +45 -0
  24. package/crawler/lib/crawler/fetcher/policer.ex +77 -0
  25. package/crawler/lib/crawler/fetcher/recorder.ex +55 -0
  26. package/crawler/lib/crawler/fetcher/requester.ex +32 -0
  27. package/crawler/lib/crawler/fetcher/retrier.ex +43 -0
  28. package/crawler/lib/crawler/fetcher/url_filter.ex +26 -0
  29. package/crawler/lib/crawler/fetcher.ex +81 -0
  30. package/crawler/lib/crawler/http.ex +7 -0
  31. package/crawler/lib/crawler/linker/path_builder.ex +71 -0
  32. package/crawler/lib/crawler/linker/path_expander.ex +59 -0
  33. package/crawler/lib/crawler/linker/path_finder.ex +106 -0
  34. package/crawler/lib/crawler/linker/path_offliner.ex +59 -0
  35. package/crawler/lib/crawler/linker/path_prefixer.ex +46 -0
  36. package/crawler/lib/crawler/linker.ex +173 -0
  37. package/crawler/lib/crawler/options.ex +127 -0
  38. package/crawler/lib/crawler/parser/css_parser.ex +37 -0
  39. package/crawler/lib/crawler/parser/guarder.ex +38 -0
  40. package/crawler/lib/crawler/parser/html_parser.ex +41 -0
  41. package/crawler/lib/crawler/parser/link_parser/link_expander.ex +32 -0
  42. package/crawler/lib/crawler/parser/link_parser.ex +50 -0
  43. package/crawler/lib/crawler/parser.ex +122 -0
  44. package/crawler/lib/crawler/queue_handler.ex +45 -0
  45. package/crawler/lib/crawler/scraper.ex +28 -0
  46. package/crawler/lib/crawler/snapper/dir_maker.ex +45 -0
  47. package/crawler/lib/crawler/snapper/link_replacer.ex +95 -0
  48. package/crawler/lib/crawler/snapper.ex +82 -0
  49. package/crawler/lib/crawler/store/counter.ex +19 -0
  50. package/crawler/lib/crawler/store/page.ex +7 -0
  51. package/crawler/lib/crawler/store.ex +87 -0
  52. package/crawler/lib/crawler/worker.ex +62 -0
  53. package/crawler/lib/crawler.ex +91 -0
  54. package/crawler/mix.exs +78 -0
  55. package/crawler/mix.lock +40 -0
  56. package/crawler/test/fixtures/introducing-elixir.jpg +0 -0
  57. package/crawler/test/integration_test.exs +135 -0
  58. package/crawler/test/lib/crawler/dispatcher/worker_test.exs +7 -0
  59. package/crawler/test/lib/crawler/dispatcher_test.exs +5 -0
  60. package/crawler/test/lib/crawler/fetcher/header_preparer_test.exs +7 -0
  61. package/crawler/test/lib/crawler/fetcher/policer_test.exs +71 -0
  62. package/crawler/test/lib/crawler/fetcher/recorder_test.exs +9 -0
  63. package/crawler/test/lib/crawler/fetcher/requester_test.exs +9 -0
  64. package/crawler/test/lib/crawler/fetcher/retrier_test.exs +7 -0
  65. package/crawler/test/lib/crawler/fetcher/url_filter_test.exs +7 -0
  66. package/crawler/test/lib/crawler/fetcher_test.exs +153 -0
  67. package/crawler/test/lib/crawler/http_test.exs +47 -0
  68. package/crawler/test/lib/crawler/linker/path_builder_test.exs +7 -0
  69. package/crawler/test/lib/crawler/linker/path_expander_test.exs +7 -0
  70. package/crawler/test/lib/crawler/linker/path_finder_test.exs +7 -0
  71. package/crawler/test/lib/crawler/linker/path_offliner_test.exs +7 -0
  72. package/crawler/test/lib/crawler/linker/path_prefixer_test.exs +7 -0
  73. package/crawler/test/lib/crawler/linker_test.exs +7 -0
  74. package/crawler/test/lib/crawler/options_test.exs +7 -0
  75. package/crawler/test/lib/crawler/parser/css_parser_test.exs +7 -0
  76. package/crawler/test/lib/crawler/parser/guarder_test.exs +7 -0
  77. package/crawler/test/lib/crawler/parser/html_parser_test.exs +7 -0
  78. package/crawler/test/lib/crawler/parser/link_parser/link_expander_test.exs +7 -0
  79. package/crawler/test/lib/crawler/parser/link_parser_test.exs +7 -0
  80. package/crawler/test/lib/crawler/parser_test.exs +8 -0
  81. package/crawler/test/lib/crawler/queue_handler_test.exs +7 -0
  82. package/crawler/test/lib/crawler/scraper_test.exs +7 -0
  83. package/crawler/test/lib/crawler/snapper/dir_maker_test.exs +7 -0
  84. package/crawler/test/lib/crawler/snapper/link_replacer_test.exs +7 -0
  85. package/crawler/test/lib/crawler/snapper_test.exs +9 -0
  86. package/crawler/test/lib/crawler/worker_test.exs +5 -0
  87. package/crawler/test/lib/crawler_test.exs +295 -0
  88. package/crawler/test/support/test_case.ex +24 -0
  89. package/crawler/test/support/test_helpers.ex +28 -0
  90. package/crawler/test/test_helper.exs +7 -0
  91. package/grell/.rspec +2 -0
  92. package/grell/.travis.yml +28 -0
  93. package/grell/CHANGELOG.md +111 -0
  94. package/grell/Gemfile +7 -0
  95. package/grell/LICENSE.txt +22 -0
  96. package/grell/README.md +213 -0
  97. package/grell/Rakefile +2 -0
  98. package/grell/grell.gemspec +36 -0
  99. package/grell/lib/grell/capybara_driver.rb +44 -0
  100. package/grell/lib/grell/crawler.rb +83 -0
  101. package/grell/lib/grell/crawler_manager.rb +84 -0
  102. package/grell/lib/grell/grell_logger.rb +10 -0
  103. package/grell/lib/grell/page.rb +275 -0
  104. package/grell/lib/grell/page_collection.rb +62 -0
  105. package/grell/lib/grell/rawpage.rb +62 -0
  106. package/grell/lib/grell/reader.rb +18 -0
  107. package/grell/lib/grell/version.rb +3 -0
  108. package/grell/lib/grell.rb +11 -0
  109. package/grell/spec/lib/capybara_driver_spec.rb +38 -0
  110. package/grell/spec/lib/crawler_manager_spec.rb +174 -0
  111. package/grell/spec/lib/crawler_spec.rb +361 -0
  112. package/grell/spec/lib/page_collection_spec.rb +159 -0
  113. package/grell/spec/lib/page_spec.rb +418 -0
  114. package/grell/spec/lib/reader_spec.rb +43 -0
  115. package/grell/spec/spec_helper.rb +66 -0
  116. package/heartmagic/config.py +1 -0
  117. package/heartmagic/heart.py +3 -0
  118. package/heartmagic/pytransform/__init__.py +483 -0
  119. package/heartmagic/pytransform/_pytransform.dll +0 -0
  120. package/heartmagic/pytransform/_pytransform.so +0 -0
  121. package/httpStatusCode/README.md +2 -0
  122. package/httpStatusCode/httpStatusCode.js +4 -0
  123. package/httpStatusCode/reasonPhrases.js +344 -0
  124. package/httpStatusCode/statusCodes.js +344 -0
  125. package/package.json +1 -1
  126. package/rubyretriever/.rspec +2 -0
  127. package/rubyretriever/.travis.yml +7 -0
  128. package/rubyretriever/Gemfile +3 -0
  129. package/rubyretriever/Gemfile.lock +64 -0
  130. package/rubyretriever/LICENSE +20 -0
  131. package/rubyretriever/Rakefile +7 -0
  132. package/rubyretriever/bin/rr +79 -0
  133. package/rubyretriever/lib/retriever/cli.rb +25 -0
  134. package/rubyretriever/lib/retriever/core_ext.rb +13 -0
  135. package/rubyretriever/lib/retriever/fetch.rb +268 -0
  136. package/rubyretriever/lib/retriever/fetchfiles.rb +71 -0
  137. package/rubyretriever/lib/retriever/fetchseo.rb +18 -0
  138. package/rubyretriever/lib/retriever/fetchsitemap.rb +43 -0
  139. package/rubyretriever/lib/retriever/link.rb +47 -0
  140. package/rubyretriever/lib/retriever/openuri_redirect_patch.rb +8 -0
  141. package/rubyretriever/lib/retriever/page.rb +104 -0
  142. package/rubyretriever/lib/retriever/page_iterator.rb +21 -0
  143. package/rubyretriever/lib/retriever/target.rb +47 -0
  144. package/rubyretriever/lib/retriever/version.rb +4 -0
  145. package/rubyretriever/lib/retriever.rb +15 -0
  146. package/rubyretriever/readme.md +166 -0
  147. package/rubyretriever/rubyretriever.gemspec +41 -0
  148. package/rubyretriever/spec/link_spec.rb +77 -0
  149. package/rubyretriever/spec/page_spec.rb +94 -0
  150. package/rubyretriever/spec/retriever_spec.rb +84 -0
  151. package/rubyretriever/spec/spec_helper.rb +17 -0
  152. package/rubyretriever/spec/target_spec.rb +55 -0
  153. package/snapcrawl/.changelog.old.md +157 -0
  154. package/snapcrawl/.gitattributes +1 -0
  155. package/snapcrawl/.github/workflows/test.yml +41 -0
  156. package/snapcrawl/.rspec +3 -0
  157. package/snapcrawl/.rubocop.yml +23 -0
  158. package/snapcrawl/CHANGELOG.md +182 -0
  159. package/snapcrawl/Gemfile +15 -0
  160. package/snapcrawl/LICENSE +21 -0
  161. package/snapcrawl/README.md +135 -0
  162. package/snapcrawl/Runfile +35 -0
  163. package/snapcrawl/bin/snapcrawl +25 -0
  164. package/snapcrawl/lib/snapcrawl/cli.rb +52 -0
  165. package/snapcrawl/lib/snapcrawl/config.rb +60 -0
  166. package/snapcrawl/lib/snapcrawl/crawler.rb +98 -0
  167. package/snapcrawl/lib/snapcrawl/dependencies.rb +21 -0
  168. package/snapcrawl/lib/snapcrawl/exceptions.rb +5 -0
  169. package/snapcrawl/lib/snapcrawl/log_helpers.rb +36 -0
  170. package/snapcrawl/lib/snapcrawl/page.rb +118 -0
  171. package/snapcrawl/lib/snapcrawl/pretty_logger.rb +11 -0
  172. package/snapcrawl/lib/snapcrawl/refinements/pair_split.rb +26 -0
  173. package/snapcrawl/lib/snapcrawl/refinements/string_refinements.rb +13 -0
  174. package/snapcrawl/lib/snapcrawl/screenshot.rb +73 -0
  175. package/snapcrawl/lib/snapcrawl/templates/config.yml +49 -0
  176. package/snapcrawl/lib/snapcrawl/templates/docopt.txt +26 -0
  177. package/snapcrawl/lib/snapcrawl/version.rb +3 -0
  178. package/snapcrawl/lib/snapcrawl.rb +20 -0
  179. package/snapcrawl/snapcrawl.gemspec +27 -0
  180. package/snapcrawl/snapcrawl.yml +41 -0
  181. package/snapcrawl/spec/README.md +16 -0
  182. package/snapcrawl/spec/approvals/bin/help +26 -0
  183. package/snapcrawl/spec/approvals/bin/usage +4 -0
  184. package/snapcrawl/spec/approvals/cli/usage +4 -0
  185. package/snapcrawl/spec/approvals/config/defaults +15 -0
  186. package/snapcrawl/spec/approvals/config/minimal +15 -0
  187. package/snapcrawl/spec/approvals/integration/blacklist +14 -0
  188. package/snapcrawl/spec/approvals/integration/default-config +14 -0
  189. package/snapcrawl/spec/approvals/integration/depth-0 +6 -0
  190. package/snapcrawl/spec/approvals/integration/depth-3 +6 -0
  191. package/snapcrawl/spec/approvals/integration/log-color-no +6 -0
  192. package/snapcrawl/spec/approvals/integration/screenshot-error +3 -0
  193. package/snapcrawl/spec/approvals/integration/whitelist +14 -0
  194. package/snapcrawl/spec/approvals/models/pretty_logger/colors +1 -0
  195. package/snapcrawl/spec/fixtures/config/minimal.yml +4 -0
  196. package/snapcrawl/spec/server/config.ru +97 -0
  197. package/snapcrawl/spec/snapcrawl/bin_spec.rb +15 -0
  198. package/snapcrawl/spec/snapcrawl/cli_spec.rb +9 -0
  199. package/snapcrawl/spec/snapcrawl/config_spec.rb +26 -0
  200. package/snapcrawl/spec/snapcrawl/integration_spec.rb +65 -0
  201. package/snapcrawl/spec/snapcrawl/page_spec.rb +89 -0
  202. package/snapcrawl/spec/snapcrawl/pretty_logger_spec.rb +19 -0
  203. package/snapcrawl/spec/snapcrawl/refinements/pair_split_spec.rb +27 -0
  204. package/snapcrawl/spec/snapcrawl/refinements/string_refinements_spec.rb +29 -0
  205. package/snapcrawl/spec/snapcrawl/screenshot_spec.rb +62 -0
  206. package/snapcrawl/spec/spec_helper.rb +22 -0
  207. package/snapcrawl/spec/spec_mixin.rb +10 -0
package/rubyretriever/lib/retriever/fetch.rb
@@ -0,0 +1,268 @@
+ require 'em-synchrony'
+ require 'em-synchrony/em-http'
+ require 'em-synchrony/fiber_iterator'
+ require 'ruby-progressbar'
+ require 'open-uri'
+ require 'csv'
+ require 'bloomfilter-rb'
+
+ module Retriever
+   #
+   class Fetch
+     HR = '###############################'
+     attr_reader :max_pages, :t, :result
+     # given target URL and RR options, creates a fetch object.
+     # There is no direct output
+     # this is a parent class that the other fetch classes build off of.
+     def initialize(url, options)
+       @iterator = false
+       @result = []
+       @connection_tally = {
+         success: 0,
+         error: 0,
+         error_client: 0,
+         error_server: 0
+       }
+       setup_options(options)
+       setup_progress_bar if @progress
+       @t = Retriever::Target.new(url, @file_re)
+       @output = "rr-#{@t.host.split('.')[1]}" if @fileharvest && !@output
+       @already_crawled = setup_bloom_filter
+     end
+
+     def start
+       @page_one = crawl_page_one
+       @link_stack = create_link_stack
+       @temp_link_stack = []
+     end
+
+     def errlog(msg)
+       fail "ERROR: #{msg}"
+     end
+
+     def lg(msg)
+       puts "### #{msg}" if @verbose
+     end
+
+     # prints current data collection to STDOUT
+     def dump
+       puts HR
+       puts "Connection Tally:\n#{@connection_tally}\n#{HR}" if @verbose
+       puts "Target URL: #{@t.target}"
+       if @sitemap
+         puts 'Sitemap'
+       elsif @fileharvest
+         puts "File harvest by type: #{@fileharvest}"
+       elsif @seo
+         puts 'SEO Metrics'
+       end
+       puts "Data Dump -- Object Count: #{@result.size}"
+       puts HR
+       @result.each do |line|
+         puts line
+       end
+       puts
+     end
+
+     # writes current data collection out to CSV in current directory
+     def write
+       return false unless @output
+       i = 0
+       CSV.open("#{@output}.csv", 'w') do |csv|
+         if (i == 0) && @seo
+           csv << ['URL', 'Page Title', 'Meta Description', 'H1', 'H2']
+           i += 1
+         end
+         @result.each do |entry|
+           csv << entry
+         end
+       end
+       puts HR
+       puts "File Created: #{@output}.csv"
+       puts "Object Count: #{@result.size}"
+       puts HR
+       puts
+     end
+
+     # returns true is resp is ok to continue
+     def good_response?(resp, url)
+       return false unless resp
+       hdr = resp.response_header
+       if hdr.redirection?
+         loc = hdr.location
+         lg("#{url} Redirected to #{loc}")
+         if t.host_re =~ loc
+           @temp_link_stack.push(loc) unless @already_crawled.include?(loc)
+           lg('--Added to stack for later')
+           return false
+         end
+         lg("Redirection outside of target host. No - go. #{loc}")
+         return false
+       end
+       # lets not continue if unsuccessful connection
+       unless hdr.successful?
+         lg("UNSUCCESSFUL CONNECTION -- #{url}")
+         @connection_tally[:error] += 1
+         @connection_tally[:error_server] += 1 if hdr.server_error?
+         @connection_tally[:error_client] += 1 if hdr.client_error?
+         return false
+       end
+       # let's not continue if not text/html
+       unless hdr['CONTENT_TYPE'] =~ %r{(text/html|application/xhtml+xml)}
+         @already_crawled.insert(url)
+         lg("Page Not text/html -- #{url}")
+         return false
+       end
+       @connection_tally[:success] += 1
+       true
+     end
+
+     def filter_out_querystrings(path)
+       if path.include?('?')
+         uri = Addressable::URI.parse(path)
+         uri.query_values = {}
+         return uri.to_s.chomp('?')
+       end
+       path
+     end
+
+     private
+
+     def setup_options(options)
+       @progress = options['progress']
+       @max_pages = options['maxpages'] ? options['maxpages'].to_i : 100
+       @verbose = options['verbose']
+       @output = options['filename']
+       @fileharvest = options['fileharvest']
+       @sitemap = options['sitemap']
+       @seo = options['seo']
+       @autodown = options['autodown']
+       @file_re = Regexp.new(/.#{@fileharvest}/).freeze if @fileharvest
+     end
+
+     def setup_bloom_filter
+       already_crawled = BloomFilter::Native.new(
+         size: 1_000_000,
+         hashes: 5,
+         seed: 1,
+         bucket: 8,
+         raise: false
+       )
+       already_crawled.insert(@t.target)
+       already_crawled
+     end
+
+     def setup_progress_bar
+       # verbose & progressbar conflict
+       errlog('CANNOT RUN VERBOSE & PROGRESSBAR AT SAME TIME') if @verbose
+       prgress_vars = {
+         title: 'Pages',
+         starting_at: 1,
+         total: @max_pages,
+         format: '%a |%b>%i| %c/%C %t'
+       }
+       @progressbar = ProgressBar.create(prgress_vars)
+     end
+
+     def crawl_page_one
+       page_one = Retriever::Page.new(@t.target, @t.source, @t)
+       lg("URL Crawled: #{@t.target}")
+       page_one
+     end
+
+     def create_link_stack
+       link_stack = @page_one.parse_internal_visitable
+       errlog("Bad URL -- #{@t.target}") unless link_stack
+       lg("#{link_stack.size - 1} links found")
+       link_stack.delete(@t.target)
+       link_stack.take(@max_pages) if (link_stack.size + 1) > @max_pages
+       link_stack
+     end
+
+     def end_crawl_notice
+       notice = "#{HR}\nENDING CRAWL\nCan't find any more links."
+       @progressbar.log(notice) if @progress
+       lg(notice)
+     end
+
+     # iterates over the existing @link_stack
+     # running until we reach the @max_pages value.
+     def async_crawl_and_collect(&block)
+       while @already_crawled.size < @max_pages
+         if @link_stack.empty?
+           end_crawl_notice
+           break
+         end
+         new_links_arr = process_link_stack(&block)
+         @temp_link_stack = []
+         next if new_links_arr.nil? || new_links_arr.empty?
+         @link_stack.concat(new_links_arr)
+         next unless @sitemap
+         @result.concat(new_links_arr)
+       end
+       @result.uniq!
+     end
+
+     def push_seo_to_result(url, new_page)
+       seos = [url]
+       seos.concat(new_page.parse_seo)
+       @result.push(seos)
+       lg('--page SEO scraped')
+     end
+
+     def push_files_to_result(new_page)
+       filez = new_page.parse_files(new_page.parse_internal)
+       @result.concat(filez) unless filez.empty?
+       lg("--#{filez.size} files found")
+     end
+
+     def page_from_response(url, response)
+       lg("Page Fetched: #{url}")
+       @already_crawled.insert(url)
+       if @progress && (@already_crawled.size < @max_pages)
+         @progressbar.increment
+       end
+       Retriever::Page.new(url, response, @t)
+     end
+
+     def new_visitable_links(current_page)
+       lg("--#{current_page.links.size} links found")
+       current_page.parse_internal_visitable
+     end
+
+     def push_custom_to_result(url, current_page, &block)
+       data = block.call current_page
+       @result.push(data) unless data.empty?
+       lg("-- PageIterator called on: #{url}")
+     end
+
+     # send a new wave of GET requests, using current @link_stack
+     # at end of the loop it empties link_stack
+     # puts new links into temporary stack
+     def process_link_stack(&block)
+       EM.synchrony do
+         concurrency = 10
+         EM::Synchrony::FiberIterator.new(@link_stack, concurrency).each do |url|
+           next if @already_crawled.size >= @max_pages
+           next if @already_crawled.include?(url)
+           resp = EventMachine::HttpRequest.new(url).get
+           next unless good_response?(resp, url)
+           current_page = page_from_response(url, resp.response)
+           # non-link dependent modes
+           push_seo_to_result(url, current_page) if @seo
+           push_custom_to_result(url, current_page, &block) if @iterator
+           next unless current_page.links.size > 0
+           @temp_link_stack.push(new_visitable_links(current_page))
+           # link dependent modes
+           next unless @fileharvest
+           push_files_to_result(current_page)
+         end
+         EventMachine.stop
+       end
+       # empty the stack. most clean way
+       @link_stack = []
+       # temp contains redirects + new visitable links
+       @temp_link_stack.flatten.uniq!
+     end
+   end
+ end
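
Fetch is the parent class for the mode-specific crawlers in the files below; setup_options reads plain string keys from the options hash. For reference, a sketch of a fully populated hash (values are illustrative, not defaults):

    # Illustrative options hash; keys mirror setup_options above.
    options = {
      'maxpages'    => '250',       # converted with to_i; defaults to 100
      'progress'    => true,        # ruby-progressbar output (conflicts with 'verbose')
      'verbose'     => false,       # per-page logging via lg
      'filename'    => 'crawl_out', # #write emits crawl_out.csv when set
      'sitemap'     => true,        # collect every unique internal page URL
      'seo'         => false,       # collect [url, title, desc, h1, h2] rows per page
      'fileharvest' => 'pdf',       # builds @file_re and enables file collection
      'autodown'    => false        # stored as @autodown; not referenced in the classes shown here
    }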
package/rubyretriever/lib/retriever/fetchfiles.rb
@@ -0,0 +1,71 @@
+ module Retriever
+   # receives target url and RR options
+   # returns an array of all unique files (based on given filetype)
+   # found on the target site
+
+   class FetchFiles < Fetch
+     def initialize(url, options)
+       super
+       start
+       temp_file_collection = @page_one.parse_files(@page_one.parse_internal)
+       @result.concat(temp_file_collection) if temp_file_collection.size > 0
+       lg("#{@result.size} new files found")
+
+       async_crawl_and_collect
+       # done, make sure progress bar says we are done
+       @progressbar.finish if @progress
+       @result.sort_by! { |x| x.length }
+     end
+
+     def download_file(path)
+       path = filter_out_querystrings(path)
+       # given valid url, downloads file to current directory in /rr-downloads/
+       arr = path.split('/')
+       shortname = arr.pop
+       puts "Initiating Download of: #{shortname}"
+       File.open(shortname, 'wb') do |saved_file|
+         open(path) do |read_file|
+           saved_file.write(read_file.read)
+         end
+       end
+       puts ' SUCCESS: Download Complete'
+     end
+
+     def autodownload
+       # go through the fetched file URL collection and download each one.
+       puts HR
+       puts '### Initiating Autodownload...'
+       puts HR
+       puts "#{@result.count} - #{@file_ext}'s Located"
+       puts HR
+       move_to_download_dir
+       iterate_thru_collection_and_download
+       Dir.chdir('..')
+     end
+
+     private
+
+     def iterate_thru_collection_and_download
+       lenn = @result.count
+       @result.each_with_index do |entry, i|
+         begin
+           download_file(entry)
+         rescue StandardError
+           puts "ERROR: failed to download - #{entry}"
+         end
+         lg(" File [#{i + 1} of #{lenn}]\n")
+       end
+     end
+
+     def move_to_download_dir(dir_name = 'rr-downloads')
+       if File.directory?(dir_name)
+         Dir.chdir(dir_name)
+       else
+         puts "creating #{dir_name} Directory"
+         Dir.mkdir(dir_name)
+         Dir.chdir(dir_name)
+       end
+       puts "Downloading files to local directory: '/#{dir_name}/'"
+     end
+   end
+ end
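
FetchFiles crawls the target and collects URLs of files matching the harvest type; a hypothetical invocation (the domain and the 'pdf' filter are placeholders):

    fetch = Retriever::FetchFiles.new('example.com', 'fileharvest' => 'pdf')
    fetch.dump          # print the collected file URLs
    fetch.autodownload  # save each one into ./rr-downloads/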
package/rubyretriever/lib/retriever/fetchseo.rb
@@ -0,0 +1,18 @@
+ module Retriever
+   #
+   class FetchSEO < Fetch
+     # receives target url and RR options
+     # returns an array of onpage SEO related fields
+     # on all unique pages found on the site
+     def initialize(url, options)
+       super
+       start
+       @result.push(@page_one.parse_seo)
+
+       async_crawl_and_collect
+       # done, make sure progress bar says we are done
+       @progressbar.finish if @progress
+       @result.sort_by! { |x| x[0].length }
+     end
+   end
+ end
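
FetchSEO stores one row of on-page SEO fields per crawled page; a minimal sketch (note that the 'seo' option must be truthy for pages beyond the first to be collected):

    fetch = Retriever::FetchSEO.new('example.com', 'seo' => true, 'maxpages' => '25')
    fetch.dump    # prints one title/description/h1/h2 row per page
    fetch.result  # raw rows, sorted by the length of each row's first field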
package/rubyretriever/lib/retriever/fetchsitemap.rb
@@ -0,0 +1,43 @@
+ module Retriever
+   #
+   class FetchSitemap < Fetch
+     # receives target URL and RR options
+     # returns an array of all unique pages found on the site
+     def initialize(url, options)
+       super
+       start
+       @result.push(@t.target)
+       @result.concat(@link_stack)
+
+       async_crawl_and_collect
+       # done, make sure progress bar says we are done
+       @progressbar.finish if @progress
+       @result.sort_by! { |x| x.length } if @result.size > 1
+       @result.uniq!
+     end
+
+     # produces valid XML sitemap based on page collection fetched.
+     # Writes to current directory.
+     def gen_xml
+       filename = @t.host.split('.')[1]
+       f = File.open("sitemap-#{filename}.xml", 'w+')
+       f << "<?xml version='1.0' encoding='UTF-8'?>"
+       f << "<urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>"
+       @result.each do |url|
+         f << "<url><loc>#{url}</loc></url>"
+       end
+       f << '</urlset>'
+       f.close
+       print_file_info(filename)
+     end
+
+     private
+
+     def print_file_info(filename)
+       puts HR
+       puts "File Created: sitemap-#{filename}.xml"
+       puts "Object Count: #{@result.size}"
+       puts HR + "\n"
+     end
+   end
+ end
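
FetchSitemap gathers every unique internal page URL and can serialize the collection as XML; a sketch (domain is a placeholder):

    fetch = Retriever::FetchSitemap.new('www.example.com', 'sitemap' => true)
    fetch.result   # the collected page URLs
    fetch.gen_xml  # writes sitemap-<second host label>.xml, here sitemap-example.xml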
package/rubyretriever/lib/retriever/link.rb
@@ -0,0 +1,47 @@
+ require 'addressable/uri'
+ module Retriever
+   #
+   class Link
+     # HTTP_RE = Regexp.new(/^http/i).freeze
+     SLASH_RE = Regexp.new(%r(^/{1}[^/])).freeze
+     DOUBLE_SLASH_RE = Regexp.new(%r(^/{2}[^/])).freeze
+     WWW_DOT_RE = Regexp.new(/^www\./i).freeze
+
+     def initialize(target_scheme, target_host, this_link, current_url)
+       begin
+         #this_link = Addressable::URI.encode(this_link) //not necessary; and breaking links
+         @link_uri = Addressable::URI.parse(this_link)
+       rescue Addressable::URI::InvalidURIError
+         dummy = Retriever::Link.new(target_scheme, target_host, target_host, target_host)
+         @link_uri = Addressable::URI.parse(dummy.path)
+       end
+       @scheme = target_scheme
+       @host = target_host
+       @this_link = @link_uri.to_s
+       @current_page_url = current_url
+     end
+
+     def path
+       return this_link if link_uri.absolute?
+
+       return "#{@scheme}://#{this_link}" if WWW_DOT_RE =~ this_link
+
+       return "#{@scheme}://#{host}#{this_link}" if SLASH_RE =~ this_link
+
+       # link begins with '//'
+       return "#{@scheme}:#{this_link}" if DOUBLE_SLASH_RE =~ this_link
+
+       # link uses relative path with no slashes at all
+       if link_uri.relative?
+         if @current_page_url[-1, 1] == "/"
+           return "#{@current_page_url}#{this_link}"
+         end
+         return "#{@current_page_url}/#{this_link}"
+       end
+     end
+
+     private
+
+     attr_reader :this_link, :host, :link_uri, :current_page_url
+   end
+ end
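
Link#path resolves an href against the crawl target; a few worked cases following the branches above (hosts and paths are illustrative):

    Retriever::Link.new('http', 'example.com', '/about', 'http://example.com').path
    # => "http://example.com/about"             (single leading slash)
    Retriever::Link.new('https', 'example.com', '//cdn.example.com/app.js', 'https://example.com').path
    # => "https://cdn.example.com/app.js"       (protocol-relative '//')
    Retriever::Link.new('http', 'example.com', 'team.html', 'http://example.com/about').path
    # => "http://example.com/about/team.html"   (relative path, no slash)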
package/rubyretriever/lib/retriever/openuri_redirect_patch.rb
@@ -0,0 +1,8 @@
+ #
+ module OpenURI
+   # nesc patch otherwise OPENURI blocks redirects to and from https
+   def self.redirectable?(uri1, uri2)
+     uri1.scheme.downcase == uri2.scheme.downcase ||
+       (/\A(?:http|https)\z/i =~ uri1.scheme && /\A(?:http|https)\z/i =~ uri2.scheme)
+   end
+ end
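
A sketch of the effect, assuming the stock open-uri check that permits http -> https redirects but refuses the reverse direction (the patch must have been required first):

    require 'open-uri'
    require 'uri'
    OpenURI.redirectable?(URI('https://example.com'), URI('http://example.com'))
    # => true with this patch; stock open-uri would refuse the https -> http direction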
package/rubyretriever/lib/retriever/page.rb
@@ -0,0 +1,104 @@
+ require 'nokogiri'
+ require 'addressable/uri'
+ #
+ using SourceString
+ module Retriever
+   #
+   class Page
+     HASH_RE = Regexp.new(/^#/i).freeze
+     HTTP_RE = Regexp.new(/^http/i).freeze
+     H1_RE = Regexp.new(/<h1>(.*)<\/h1>/i).freeze
+     H2_RE = Regexp.new(/<h2>(.*)<\/h2>/i).freeze
+     TITLE_RE = Regexp.new(/<title>(.*)<\/title>/i).freeze
+     DESC_RE = Regexp.new(/<meta[^>]*name=[\"|\']description[\"|\']
+                           [^>]*content=[\"]
+                           (
+                           [^\"]*
+                           )
+                           [\"]
+                           [^>]
+                           *>
+                           /ix).freeze
+     HREF_CONTENTS_RE = Regexp.new(/\shref=
+                                    ['|"]
+                                    (
+                                    [^\s]
+                                    [a-z0-9\.\/\:\-\%\+\?\!\=\&\,\:\;\~\_]+
+                                    )
+                                    ['|"]
+                                    [\s|\W]
+                                    /ix).freeze
+     NONPAGE_EXT_RE = Regexp.new(/\.
+                                  (?:css|js|png|gif|jpg|mp4|
+                                  wmv|flv|mp3|wav|doc|txt|ico|xml)
+                                  /ix).freeze
+
+     attr_reader :links, :source, :t, :url
+
+     def initialize(url, source, t)
+       @url = url
+       @t = t
+       @source = source.encode_utf8_and_replace
+       @links = nil
+     end
+
+     # receives page source as string
+     # returns array of unique href links
+     def links
+       return @links if @links
+       return false unless @source
+       @links = @source.scan(HREF_CONTENTS_RE).map do |match|
+         # filter some malformed URLS that come in
+         # meant to be a loose filter to catch all reasonable HREF attributes.
+         link = match[0]
+         next if HASH_RE =~ link
+         Link.new(@t.scheme, host_with_port, link, @url).path
+       end.compact.uniq
+     end
+
+     def host_with_port
+       return @t.host if @t.port.nil?
+
+       @t.host + ':' + @t.port.to_s
+     end
+
+     def parse_internal
+       links.select do |x|
+         @t.host == Addressable::URI.parse(Addressable::URI.encode(x)).host
+       end
+     end
+
+     def parse_internal_visitable
+       parse_internal.select { |x| !(NONPAGE_EXT_RE =~ x) }
+     end
+
+     def parse_files(arr = parse_internal)
+       arr.select { |x| @t.file_re =~ x }
+     end
+
+     def parse_by_css(selector)
+       nokogiri_doc = Nokogiri::HTML(@source)
+       nokogiri_doc.css(selector).text
+     end
+
+     def title
+       TITLE_RE =~ @source ? @source.match(TITLE_RE)[1].decode_html : ''
+     end
+
+     def desc
+       DESC_RE =~ @source ? @source.match(DESC_RE)[1].decode_html : ''
+     end
+
+     def h1
+       H1_RE =~ @source ? @source.match(H1_RE)[1].decode_html : ''
+     end
+
+     def h2
+       H2_RE =~ @source ? @source.match(H2_RE)[1].decode_html : ''
+     end
+
+     def parse_seo
+       [title, desc, h1, h2]
+     end
+   end
+ end
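
Page wraps fetched markup with regex-based extraction helpers; a small sketch (this fetches example.com over the network, so the results depend on the live page):

    t    = Retriever::Target.new('http://example.com')
    page = Retriever::Page.new(t.target, t.source, t)  # t.source performs the HTTP fetch
    page.links                     # every href found in the markup
    page.parse_internal_visitable  # internal links minus non-page assets (css, js, images, ...)
    page.parse_seo                 # [title, meta description, h1, h2]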
package/rubyretriever/lib/retriever/page_iterator.rb
@@ -0,0 +1,21 @@
+ module Retriever
+   #
+   class PageIterator < Fetch
+     # receives target url and RR options, and a block
+     # runs the block on all pages during crawl, pushing
+     # the returned value of the block onto a result stack
+     # the complete data returned from the crawl is accessible thru self.result
+     def initialize(url, options, &block)
+       super
+       start
+       fail 'block required for PageIterator' unless block_given?
+       @iterator = true
+       @result.push(block.call @page_one)
+       lg("-- PageIterator crawled- #{url}")
+       async_crawl_and_collect(&block)
+       # done, make sure progress bar says we are done
+       @progressbar.finish if @progress
+       @result.sort_by! { |x| x.length } if @result.size > 1
+     end
+   end
+ end
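
PageIterator runs a caller-supplied block against every crawled page and collects each return value; a sketch (block body and domain are illustrative):

    rr = Retriever::PageIterator.new('example.com', 'maxpages' => '10') do |page|
      [page.url, page.title]  # whatever the block returns is pushed onto result
    end
    rr.result.each { |url, title| puts "#{title} <#{url}>" }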
package/rubyretriever/lib/retriever/target.rb
@@ -0,0 +1,47 @@
+ require 'open-uri'
+ require 'addressable/uri'
+
+ module Retriever
+   #
+   class Target
+     HTTP_RE = Regexp.new(/^http/i).freeze
+
+     attr_reader :host, :target, :host_re, :source, :file_re, :scheme, :port
+
+     def initialize(url, file_re = nil)
+       fail 'Bad URL' unless url.include?('.')
+       url = "http://#{url}" unless HTTP_RE =~ url
+       target_uri = Addressable::URI.parse(Addressable::URI.encode(url))
+       @target = target_uri.to_s
+       @host = target_uri.host
+       @host_re = Regexp.new(@host.sub('www.', ''))
+       @file_re ||= file_re
+       @scheme = target_uri.scheme
+       @port = target_uri.port
+     end
+
+     def source
+       resp = open(@target)
+       resp_url = resp.base_uri.to_s
+       if @target != resp_url
+         fail "Domain redirecting: #{resp_url}" unless @host_re =~ resp_url
+         # if redirect URL is same host, we want to re-sync @target
+         return resync_target_and_return_source(resp_url)
+       end
+       resp = resp.read
+       #
+       fail 'Domain is not working. Try the non-WWW version.' if resp == ''
+       fail 'Domain not working. Try HTTPS???' unless resp
+       # consider using scrub from ruby 2.1? this misses some things
+       resp.encode('UTF-8', 'binary', invalid: :replace, undef: :replace)
+     end
+
+     def resync_target_and_return_source(url)
+       new_t = Retriever::Target.new(url)
+       @target = new_t.target
+       @host = new_t.host
+       @scheme = new_t.scheme
+       new_t.source
+     end
+   end
+ end
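
Target normalizes the seed URL and fetches its source via open-uri; a sketch of the derived fields (domain and file pattern are placeholders):

    t = Retriever::Target.new('www.example.com', /\.pdf/i)
    t.target   # => "http://www.example.com"  (scheme prepended when missing)
    t.host     # => "www.example.com"
    t.host_re  # matches same-host URLs, ignoring a leading 'www.'
    t.source   # GETs the page body, re-syncing @target on same-host redirects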
package/rubyretriever/lib/retriever/version.rb
@@ -0,0 +1,4 @@
+ #
+ module Retriever
+   VERSION = '1.4.5'
+ end
package/rubyretriever/lib/retriever.rb
@@ -0,0 +1,15 @@
+ require 'retriever/core_ext'
+ require 'retriever/fetch'
+ require 'retriever/fetchfiles'
+ require 'retriever/fetchsitemap'
+ require 'retriever/fetchseo'
+ require 'retriever/page_iterator'
+ require 'retriever/cli'
+ require 'retriever/link'
+ require 'retriever/target'
+ require 'retriever/page'
+ require 'retriever/openuri_redirect_patch'
+
+ #
+ module Retriever
+ end