powerdlz23 1.2.3 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. package/Spider/README.md +19 -0
  2. package/Spider/domain.py +18 -0
  3. package/Spider/general.py +51 -0
  4. package/Spider/link_finder.py +25 -0
  5. package/Spider/main.py +50 -0
  6. package/Spider/spider.py +74 -0
  7. package/crawler/.formatter.exs +5 -0
  8. package/crawler/.github/workflows/ci.yml +29 -0
  9. package/crawler/.recode.exs +33 -0
  10. package/crawler/.tool-versions +2 -0
  11. package/crawler/CHANGELOG.md +82 -0
  12. package/crawler/README.md +198 -0
  13. package/crawler/architecture.svg +4 -0
  14. package/crawler/config/config.exs +9 -0
  15. package/crawler/config/dev.exs +5 -0
  16. package/crawler/config/test.exs +5 -0
  17. package/crawler/examples/google_search/scraper.ex +37 -0
  18. package/crawler/examples/google_search/url_filter.ex +11 -0
  19. package/crawler/examples/google_search.ex +77 -0
  20. package/crawler/lib/crawler/dispatcher/worker.ex +14 -0
  21. package/crawler/lib/crawler/dispatcher.ex +20 -0
  22. package/crawler/lib/crawler/fetcher/header_preparer.ex +60 -0
  23. package/crawler/lib/crawler/fetcher/modifier.ex +45 -0
  24. package/crawler/lib/crawler/fetcher/policer.ex +77 -0
  25. package/crawler/lib/crawler/fetcher/recorder.ex +55 -0
  26. package/crawler/lib/crawler/fetcher/requester.ex +32 -0
  27. package/crawler/lib/crawler/fetcher/retrier.ex +43 -0
  28. package/crawler/lib/crawler/fetcher/url_filter.ex +26 -0
  29. package/crawler/lib/crawler/fetcher.ex +81 -0
  30. package/crawler/lib/crawler/http.ex +7 -0
  31. package/crawler/lib/crawler/linker/path_builder.ex +71 -0
  32. package/crawler/lib/crawler/linker/path_expander.ex +59 -0
  33. package/crawler/lib/crawler/linker/path_finder.ex +106 -0
  34. package/crawler/lib/crawler/linker/path_offliner.ex +59 -0
  35. package/crawler/lib/crawler/linker/path_prefixer.ex +46 -0
  36. package/crawler/lib/crawler/linker.ex +173 -0
  37. package/crawler/lib/crawler/options.ex +127 -0
  38. package/crawler/lib/crawler/parser/css_parser.ex +37 -0
  39. package/crawler/lib/crawler/parser/guarder.ex +38 -0
  40. package/crawler/lib/crawler/parser/html_parser.ex +41 -0
  41. package/crawler/lib/crawler/parser/link_parser/link_expander.ex +32 -0
  42. package/crawler/lib/crawler/parser/link_parser.ex +50 -0
  43. package/crawler/lib/crawler/parser.ex +122 -0
  44. package/crawler/lib/crawler/queue_handler.ex +45 -0
  45. package/crawler/lib/crawler/scraper.ex +28 -0
  46. package/crawler/lib/crawler/snapper/dir_maker.ex +45 -0
  47. package/crawler/lib/crawler/snapper/link_replacer.ex +95 -0
  48. package/crawler/lib/crawler/snapper.ex +82 -0
  49. package/crawler/lib/crawler/store/counter.ex +19 -0
  50. package/crawler/lib/crawler/store/page.ex +7 -0
  51. package/crawler/lib/crawler/store.ex +87 -0
  52. package/crawler/lib/crawler/worker.ex +62 -0
  53. package/crawler/lib/crawler.ex +91 -0
  54. package/crawler/mix.exs +78 -0
  55. package/crawler/mix.lock +40 -0
  56. package/crawler/test/fixtures/introducing-elixir.jpg +0 -0
  57. package/crawler/test/integration_test.exs +135 -0
  58. package/crawler/test/lib/crawler/dispatcher/worker_test.exs +7 -0
  59. package/crawler/test/lib/crawler/dispatcher_test.exs +5 -0
  60. package/crawler/test/lib/crawler/fetcher/header_preparer_test.exs +7 -0
  61. package/crawler/test/lib/crawler/fetcher/policer_test.exs +71 -0
  62. package/crawler/test/lib/crawler/fetcher/recorder_test.exs +9 -0
  63. package/crawler/test/lib/crawler/fetcher/requester_test.exs +9 -0
  64. package/crawler/test/lib/crawler/fetcher/retrier_test.exs +7 -0
  65. package/crawler/test/lib/crawler/fetcher/url_filter_test.exs +7 -0
  66. package/crawler/test/lib/crawler/fetcher_test.exs +153 -0
  67. package/crawler/test/lib/crawler/http_test.exs +47 -0
  68. package/crawler/test/lib/crawler/linker/path_builder_test.exs +7 -0
  69. package/crawler/test/lib/crawler/linker/path_expander_test.exs +7 -0
  70. package/crawler/test/lib/crawler/linker/path_finder_test.exs +7 -0
  71. package/crawler/test/lib/crawler/linker/path_offliner_test.exs +7 -0
  72. package/crawler/test/lib/crawler/linker/path_prefixer_test.exs +7 -0
  73. package/crawler/test/lib/crawler/linker_test.exs +7 -0
  74. package/crawler/test/lib/crawler/options_test.exs +7 -0
  75. package/crawler/test/lib/crawler/parser/css_parser_test.exs +7 -0
  76. package/crawler/test/lib/crawler/parser/guarder_test.exs +7 -0
  77. package/crawler/test/lib/crawler/parser/html_parser_test.exs +7 -0
  78. package/crawler/test/lib/crawler/parser/link_parser/link_expander_test.exs +7 -0
  79. package/crawler/test/lib/crawler/parser/link_parser_test.exs +7 -0
  80. package/crawler/test/lib/crawler/parser_test.exs +8 -0
  81. package/crawler/test/lib/crawler/queue_handler_test.exs +7 -0
  82. package/crawler/test/lib/crawler/scraper_test.exs +7 -0
  83. package/crawler/test/lib/crawler/snapper/dir_maker_test.exs +7 -0
  84. package/crawler/test/lib/crawler/snapper/link_replacer_test.exs +7 -0
  85. package/crawler/test/lib/crawler/snapper_test.exs +9 -0
  86. package/crawler/test/lib/crawler/worker_test.exs +5 -0
  87. package/crawler/test/lib/crawler_test.exs +295 -0
  88. package/crawler/test/support/test_case.ex +24 -0
  89. package/crawler/test/support/test_helpers.ex +28 -0
  90. package/crawler/test/test_helper.exs +7 -0
  91. package/grell/.rspec +2 -0
  92. package/grell/.travis.yml +28 -0
  93. package/grell/CHANGELOG.md +111 -0
  94. package/grell/Gemfile +7 -0
  95. package/grell/LICENSE.txt +22 -0
  96. package/grell/README.md +213 -0
  97. package/grell/Rakefile +2 -0
  98. package/grell/grell.gemspec +36 -0
  99. package/grell/lib/grell/capybara_driver.rb +44 -0
  100. package/grell/lib/grell/crawler.rb +83 -0
  101. package/grell/lib/grell/crawler_manager.rb +84 -0
  102. package/grell/lib/grell/grell_logger.rb +10 -0
  103. package/grell/lib/grell/page.rb +275 -0
  104. package/grell/lib/grell/page_collection.rb +62 -0
  105. package/grell/lib/grell/rawpage.rb +62 -0
  106. package/grell/lib/grell/reader.rb +18 -0
  107. package/grell/lib/grell/version.rb +3 -0
  108. package/grell/lib/grell.rb +11 -0
  109. package/grell/spec/lib/capybara_driver_spec.rb +38 -0
  110. package/grell/spec/lib/crawler_manager_spec.rb +174 -0
  111. package/grell/spec/lib/crawler_spec.rb +361 -0
  112. package/grell/spec/lib/page_collection_spec.rb +159 -0
  113. package/grell/spec/lib/page_spec.rb +418 -0
  114. package/grell/spec/lib/reader_spec.rb +43 -0
  115. package/grell/spec/spec_helper.rb +66 -0
  116. package/heartmagic/config.py +1 -0
  117. package/heartmagic/heart.py +3 -0
  118. package/heartmagic/pytransform/__init__.py +483 -0
  119. package/heartmagic/pytransform/_pytransform.dll +0 -0
  120. package/heartmagic/pytransform/_pytransform.so +0 -0
  121. package/httpStatusCode/README.md +2 -0
  122. package/httpStatusCode/httpStatusCode.js +4 -0
  123. package/httpStatusCode/reasonPhrases.js +344 -0
  124. package/httpStatusCode/statusCodes.js +344 -0
  125. package/package.json +1 -1
  126. package/rubyretriever/.rspec +2 -0
  127. package/rubyretriever/.travis.yml +7 -0
  128. package/rubyretriever/Gemfile +3 -0
  129. package/rubyretriever/Gemfile.lock +64 -0
  130. package/rubyretriever/LICENSE +20 -0
  131. package/rubyretriever/Rakefile +7 -0
  132. package/rubyretriever/bin/rr +79 -0
  133. package/rubyretriever/lib/retriever/cli.rb +25 -0
  134. package/rubyretriever/lib/retriever/core_ext.rb +13 -0
  135. package/rubyretriever/lib/retriever/fetch.rb +268 -0
  136. package/rubyretriever/lib/retriever/fetchfiles.rb +71 -0
  137. package/rubyretriever/lib/retriever/fetchseo.rb +18 -0
  138. package/rubyretriever/lib/retriever/fetchsitemap.rb +43 -0
  139. package/rubyretriever/lib/retriever/link.rb +47 -0
  140. package/rubyretriever/lib/retriever/openuri_redirect_patch.rb +8 -0
  141. package/rubyretriever/lib/retriever/page.rb +104 -0
  142. package/rubyretriever/lib/retriever/page_iterator.rb +21 -0
  143. package/rubyretriever/lib/retriever/target.rb +47 -0
  144. package/rubyretriever/lib/retriever/version.rb +4 -0
  145. package/rubyretriever/lib/retriever.rb +15 -0
  146. package/rubyretriever/readme.md +166 -0
  147. package/rubyretriever/rubyretriever.gemspec +41 -0
  148. package/rubyretriever/spec/link_spec.rb +77 -0
  149. package/rubyretriever/spec/page_spec.rb +94 -0
  150. package/rubyretriever/spec/retriever_spec.rb +84 -0
  151. package/rubyretriever/spec/spec_helper.rb +17 -0
  152. package/rubyretriever/spec/target_spec.rb +55 -0
  153. package/snapcrawl/.changelog.old.md +157 -0
  154. package/snapcrawl/.gitattributes +1 -0
  155. package/snapcrawl/.github/workflows/test.yml +41 -0
  156. package/snapcrawl/.rspec +3 -0
  157. package/snapcrawl/.rubocop.yml +23 -0
  158. package/snapcrawl/CHANGELOG.md +182 -0
  159. package/snapcrawl/Gemfile +15 -0
  160. package/snapcrawl/LICENSE +21 -0
  161. package/snapcrawl/README.md +135 -0
  162. package/snapcrawl/Runfile +35 -0
  163. package/snapcrawl/bin/snapcrawl +25 -0
  164. package/snapcrawl/lib/snapcrawl/cli.rb +52 -0
  165. package/snapcrawl/lib/snapcrawl/config.rb +60 -0
  166. package/snapcrawl/lib/snapcrawl/crawler.rb +98 -0
  167. package/snapcrawl/lib/snapcrawl/dependencies.rb +21 -0
  168. package/snapcrawl/lib/snapcrawl/exceptions.rb +5 -0
  169. package/snapcrawl/lib/snapcrawl/log_helpers.rb +36 -0
  170. package/snapcrawl/lib/snapcrawl/page.rb +118 -0
  171. package/snapcrawl/lib/snapcrawl/pretty_logger.rb +11 -0
  172. package/snapcrawl/lib/snapcrawl/refinements/pair_split.rb +26 -0
  173. package/snapcrawl/lib/snapcrawl/refinements/string_refinements.rb +13 -0
  174. package/snapcrawl/lib/snapcrawl/screenshot.rb +73 -0
  175. package/snapcrawl/lib/snapcrawl/templates/config.yml +49 -0
  176. package/snapcrawl/lib/snapcrawl/templates/docopt.txt +26 -0
  177. package/snapcrawl/lib/snapcrawl/version.rb +3 -0
  178. package/snapcrawl/lib/snapcrawl.rb +20 -0
  179. package/snapcrawl/snapcrawl.gemspec +27 -0
  180. package/snapcrawl/snapcrawl.yml +41 -0
  181. package/snapcrawl/spec/README.md +16 -0
  182. package/snapcrawl/spec/approvals/bin/help +26 -0
  183. package/snapcrawl/spec/approvals/bin/usage +4 -0
  184. package/snapcrawl/spec/approvals/cli/usage +4 -0
  185. package/snapcrawl/spec/approvals/config/defaults +15 -0
  186. package/snapcrawl/spec/approvals/config/minimal +15 -0
  187. package/snapcrawl/spec/approvals/integration/blacklist +14 -0
  188. package/snapcrawl/spec/approvals/integration/default-config +14 -0
  189. package/snapcrawl/spec/approvals/integration/depth-0 +6 -0
  190. package/snapcrawl/spec/approvals/integration/depth-3 +6 -0
  191. package/snapcrawl/spec/approvals/integration/log-color-no +6 -0
  192. package/snapcrawl/spec/approvals/integration/screenshot-error +3 -0
  193. package/snapcrawl/spec/approvals/integration/whitelist +14 -0
  194. package/snapcrawl/spec/approvals/models/pretty_logger/colors +1 -0
  195. package/snapcrawl/spec/fixtures/config/minimal.yml +4 -0
  196. package/snapcrawl/spec/server/config.ru +97 -0
  197. package/snapcrawl/spec/snapcrawl/bin_spec.rb +15 -0
  198. package/snapcrawl/spec/snapcrawl/cli_spec.rb +9 -0
  199. package/snapcrawl/spec/snapcrawl/config_spec.rb +26 -0
  200. package/snapcrawl/spec/snapcrawl/integration_spec.rb +65 -0
  201. package/snapcrawl/spec/snapcrawl/page_spec.rb +89 -0
  202. package/snapcrawl/spec/snapcrawl/pretty_logger_spec.rb +19 -0
  203. package/snapcrawl/spec/snapcrawl/refinements/pair_split_spec.rb +27 -0
  204. package/snapcrawl/spec/snapcrawl/refinements/string_refinements_spec.rb +29 -0
  205. package/snapcrawl/spec/snapcrawl/screenshot_spec.rb +62 -0
  206. package/snapcrawl/spec/spec_helper.rb +22 -0
  207. package/snapcrawl/spec/spec_mixin.rb +10 -0
package/grell/README.md ADDED
@@ -0,0 +1,213 @@
+ # Grell
+
+ [![Build Status](https://travis-ci.org/mdsol/grell.svg?branch=develop)](https://travis-ci.org/mdsol/grell)
+
+ Grell is a generic web crawler written in Ruby.
+ It can be used to gather data, test pages in a given domain, etc.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+ ```ruby
+ gem 'grell'
+ ```
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install grell
+
+ Grell uses PhantomJS as a browser; you will need to download and install it on your
+ system. See http://phantomjs.org/ for instructions.
+ Grell has been tested with PhantomJS v2.1.x.
+
+ ## Usage
+
+ ### Crawling an entire site
+
+ The main entry point of the library is `Grell::Crawler#start_crawling`.
+ Grell will yield to your code with each page it finds:
+
+ ```ruby
+ require 'grell'
+
+ crawler = Grell::Crawler.new
+ crawler.start_crawling('http://www.google.com') do |page|
+   # Grell will keep iterating this block with each unique page it finds
+   puts "yes we crawled #{page.url}"
+   puts "status: #{page.status}"
+   puts "headers: #{page.headers}"
+   puts "body: #{page.body}"
+   puts "We crawled it at #{page.timestamp}"
+   puts "We found #{page.links.size} links"
+   puts "page id and parent_id #{page.id}, #{page.parent_id}"
+ end
+ ```
+
+ Grell keeps a list of pages previously crawled and does not visit the same page twice.
+ This list is indexed by the complete URL, including query parameters.
+
+ ### Re-retrieving a page
+ If you want Grell to revisit a page and return the data to you again,
+ return the symbol :retry in the block you pass to the start_crawling method.
+ For instance:
+ ```ruby
+ require 'grell'
+ crawler = Grell::Crawler.new
+ crawler.start_crawling('http://www.google.com') do |current_page|
+   if current_page.status == 500 && current_page.retries == 0
+     crawler.manager.restart
+     :retry
+   end
+ end
+ ```
+
+ ### Pages' id
+
+ Each page has a unique id, accessed by the property `id`. Each page also stores the id of the page from which it was found, accessed by the property `parent_id`.
+ The page object generated by accessing the first URL passed to `start_crawling` (the root) has a `parent_id` equal to `nil` and an `id` equal to 0.
+ Using this information it is possible to construct a directed graph of the crawl, as sketched below.
+
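A minimal sketch of building such a graph from the crawl (an editorial example, not part of the original README; the URL is a placeholder):

```ruby
require 'grell'

edges = [] # [parent_id, id] pairs; the root page has a nil parent_id
urls  = {} # id => url, to label the nodes later

crawler = Grell::Crawler.new
crawler.start_crawling('http://www.example.com') do |page|
  urls[page.id] = page.url
  edges << [page.parent_id, page.id] unless page.parent_id.nil?
end

# edges and urls now describe the directed graph of how pages were discovered
```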
+ ### Restart and quit
+
+ Grell can be restarted. The current lists of visited and yet-to-visit pages are not modified when restarting,
+ but the browser is destroyed and recreated, so all cookies and local storage are lost. After restarting, crawling is resumed with a
+ new browser.
+ To destroy the crawler, call the `quit` method. This will free the memory taken in Ruby and destroy the PhantomJS process.
+ ```ruby
+ require 'grell'
+ crawler = Grell::Crawler.new
+ crawler.manager.restart # restarts the browser
+ crawler.manager.quit # quits and destroys the crawler
+ ```
+
+ ### Options
+
+ The `Grell::Crawler` class can be passed options to customize its behavior:
+ - `logger`: Sets the logger object, for instance `Rails.logger`. Default: `Logger.new(STDOUT)`
+ - `on_periodic_restart`: Restarts the crawler every given number of visits. Default: 100 pages.
+ - `allowlist`: Sets an allowlist filter for URLs to be visited. Default: all URLs are allowlisted.
+ - `denylist`: Sets a denylist filter for URLs to be avoided. Default: no URL is denylisted.
+ - `add_match_block`: Block evaluated to decide whether a given page should be added to the pages to be visited. Default: add unique URLs.
+ - `evaluate_in_each_page`: JavaScript snippet to be evaluated on each page visited. Default: nothing is evaluated.
+
+ By default Grell will follow all the links it finds on the site being crawled.
+ It will never follow links pointing outside your site.
+ If you want to further limit which links are crawled, you can use
+ allowlisting, denylisting or manual filtering.
+ Further details on these and other options follow below.
+
+ #### Automatically restarting PhantomJS
+ During a long crawl it is possible that PhantomJS gets into an inconsistent state or starts leaking memory.
+ The crawler can be restarted manually by calling `crawler.manager.restart` or automatically by using the
+ `on_periodic_restart` configuration key as follows:
+
+ ```ruby
+ require 'grell'
+
+ crawler = Grell::Crawler.new(on_periodic_restart: { do: my_restart_procedure, each: 200 })
+
+ crawler.start_crawling('http://www.google.com') do |current_page|
+   ...
+ end
+ ```
+
+ This code sets up the crawler to be restarted every 200 pages crawled and to call `my_restart_procedure`
+ between restarts. A restart destroys the cookies, so this custom block can be used, for instance, to log in again.
+
+ #### Allowlisting
+
+ ```ruby
+ require 'grell'
+
+ crawler = Grell::Crawler.new(allowlist: [/games\/.*/, '/fun'])
+ crawler.start_crawling('http://www.google.com')
+ ```
+
+ Here Grell will only follow links to games and '/fun' and ignore all
+ other links. You can provide a regexp, a string (a link is allowlisted if any
+ part of it matches the string) or an array of regexps and/or strings.
+
+ #### Denylisting
+
+ ```ruby
+ require 'grell'
+
+ crawler = Grell::Crawler.new(denylist: /games\/.*/)
+ crawler.start_crawling('http://www.google.com')
+ ```
+
+ Similar to allowlisting, but now Grell will follow every other link on
+ this site which does not go to /games/...
+
+ If you use both an allowlist and a denylist then both apply: a link
+ has to fulfill both conditions to survive. If you use neither, then
+ all links on this site will be crawled. Think of these options as
+ filters.
+
+ #### Manual link filtering
+
+ If you have a more complex use case, you can modify the list of links
+ manually.
+ Grell yields the page to you before it adds its links to the list of
+ links to visit, so in your block you can add links to or delete links from
+ `page.links` to control what Grell visits next, as in the sketch below.
+
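A minimal sketch of manual filtering (an editorial example, not part of the original README; the URLs are placeholders):

```ruby
require 'grell'

crawler = Grell::Crawler.new
crawler.start_crawling('http://www.example.com') do |page|
  # Drop links we never want to visit...
  page.links.delete_if { |link| link.include?('/logout') }
  # ...and queue an extra link Grell would not discover by itself.
  page.links << 'http://www.example.com/hidden-page'
end
```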
+ #### Custom URL Comparison
+ By default, Grell detects new URLs to visit by comparing the full URL
+ with the URLs of the discovered and visited links. This behavior can
+ be changed by passing an `add_match_block` to the `Grell::Crawler` constructor.
+ In the example below, the paths of the URLs (instead of the full URLs) will
+ be compared.
+
+ ```ruby
+ require 'grell'
+
+ add_match_block = Proc.new do |collection_page, page|
+   collection_page.path == page.path
+ end
+
+ crawler = Grell::Crawler.new(add_match_block: add_match_block)
+
+ crawler.start_crawling('http://www.google.com') do |current_page|
+   ...
+ end
+ ```
+
+ #### Evaluate script
+
+ You can evaluate a JavaScript snippet in each page before extracting links by passing the snippet to the `evaluate_in_each_page` option:
+
+ ```ruby
+ require 'grell'
+
+ crawler = Grell::Crawler.new(evaluate_in_each_page: "typeof jQuery !== 'undefined' && $('.dropdown').addClass('open');")
+ ```
+
+ ### Errors
+ When there is an error in the page or an internal error in the crawler (JavaScript crashed the browser, etc.), Grell will return status 404 and the headers will include the following keys (see the sketch after this list):
+ - grellStatus: 'Error'
+ - errorClass: The class of the error which broke this page.
+ - errorMessage: A descriptive message with the information Grell could gather about the error.
+
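A minimal sketch of reacting to such error pages inside the crawl block (an editorial example; it assumes the headers hash is keyed by the strings listed above):

```ruby
require 'grell'

crawler = Grell::Crawler.new
crawler.start_crawling('http://www.example.com') do |page|
  if page.headers['grellStatus'] == 'Error'
    # The page could not be fetched or the browser crashed while rendering it.
    warn "#{page.url} failed: #{page.headers['errorClass']} - #{page.headers['errorMessage']}"
  end
end
```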
+ ## Tests
+
+ Run the tests with:
+ ```
+ bundle exec rake ci
+ ```
+
+ ## Contributors
+ Grell is (c) Medidata Solutions Worldwide and owned by its major contributors:
+ * [Teruhide Hoshikawa](https://github.com/thoshikawa-mdsol)
+ * [Jordi Polo Carres](https://github.com/jcarres-mdsol)
package/grell/Rakefile ADDED
@@ -0,0 +1,2 @@
+ require 'kender/tasks'
+
package/grell/grell.gemspec ADDED
@@ -0,0 +1,36 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'grell/version'
+
+ Gem::Specification.new do |spec|
+   spec.name = "grell"
+   spec.version = Grell::VERSION
+   spec.platform = Gem::Platform::RUBY
+   spec.authors = ["Jordi Polo Carres"]
+   spec.email = ["jcarres@mdsol.com"]
+   spec.summary = %q{Ruby web crawler}
+   spec.description = %q{Ruby web crawler using PhantomJS}
+   spec.homepage = "https://github.com/mdsol/grell"
+   spec.license = 'MIT'
+
+   spec.files = `git ls-files -z`.split("\x0")
+   spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.required_ruby_version = '>= 2.1.8'
+
+   spec.add_dependency 'capybara', '~> 2.10'
+   spec.add_dependency 'poltergeist', '~> 1.11'
+
+   # spec.add_development_dependency 'bundler', '~> 1.6'
+   spec.add_development_dependency 'byebug', '~> 4.0'
+   spec.add_development_dependency 'kender', '~> 0.2'
+   spec.add_development_dependency 'rake', '~> 10.0'
+   spec.add_development_dependency 'webmock', '~> 1.18'
+   spec.add_development_dependency 'rspec', '~> 3.5'
+   spec.add_development_dependency 'puffing-billy', '~> 0.9'
+   spec.add_development_dependency 'timecop', '~> 0.8'
+   spec.add_development_dependency 'selenium-webdriver', '~> 2.53.4'
+ end
package/grell/lib/grell/capybara_driver.rb ADDED
@@ -0,0 +1,44 @@
+ module Grell
+   # This class sets up the driver for Capybara. Used internally by the CrawlerManager.
+   # It uses Poltergeist to control PhantomJS.
+   class CapybaraDriver
+     USER_AGENT = "Mozilla/5.0 (Grell Crawler)".freeze
+
+     # Returns a poltergeist driver
+     def setup_capybara
+       @poltergeist_driver = nil
+
+       # Capybara will not re-run the block if the driver name already exists, so the driver name
+       # will have a time integer appended to ensure uniqueness.
+       driver_name = "poltergeist_crawler_#{Time.now.to_f}".to_sym
+       Grell.logger.info "GRELL Registering poltergeist driver with name '#{driver_name}'"
+
+       Capybara.register_driver driver_name do |app|
+         @poltergeist_driver = Capybara::Poltergeist::Driver.new(app,
+           js_errors: false,
+           inspector: false,
+           phantomjs_logger: FakePoltergeistLogger,
+           phantomjs_options: ['--debug=no', '--load-images=no', '--ignore-ssl-errors=yes', '--ssl-protocol=TLSv1.2'])
+       end
+
+       Capybara.default_max_wait_time = 3
+       Capybara.run_server = false
+       Capybara.default_driver = driver_name
+       Capybara.current_session.driver.headers = { # The driver gets initialized when modified here
+         "DNT" => 1,
+         "User-Agent" => USER_AGENT
+       }
+
+       raise 'Poltergeist Driver could not be properly initialized' unless @poltergeist_driver
+
+       @poltergeist_driver
+     end
+
+     # The Poltergeist driver needs a class with this signature. The JavaScript console.log output is sent here.
+     # We just discard that information.
+     module FakePoltergeistLogger
+       def self.puts(*)
+       end
+     end
+   end
+ end
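For reference, a hedged sketch of how this driver plugs into the manager defined further below; passing `driver:` explicitly is optional, since `CrawlerManager` builds a `CapybaraDriver` itself when none is given:

```ruby
require 'grell'

# Build the Poltergeist driver up front and hand it to the manager.
driver  = Grell::CapybaraDriver.new.setup_capybara
manager = Grell::CrawlerManager.new(driver: driver)

manager.restart # recreate the PhantomJS process
manager.quit    # shut it down when finished
```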
package/grell/lib/grell/crawler.rb ADDED
@@ -0,0 +1,83 @@
+ module Grell
+   # This is the class that starts and controls the crawling
+   class Crawler
+     attr_reader :collection, :manager
+
+     # Creates a crawler
+     # evaluate_in_each_page: javascript block to evaluate in each page we crawl
+     # add_match_block: block to evaluate to consider if a page is part of the collection
+     # manager_options: options passed to the manager class
+     # allowlist: Sets an allowlist filter, allows a regexp, string or array of either to be matched.
+     # denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched.
+     def initialize(evaluate_in_each_page: nil, add_match_block: nil, allowlist: /.*/, denylist: /a^/, **manager_options)
+       @collection = nil
+       @manager = CrawlerManager.new(manager_options)
+       @evaluate_in_each_page = evaluate_in_each_page
+       @add_match_block = add_match_block
+       @allowlist_regexp = Regexp.union(allowlist)
+       @denylist_regexp = Regexp.union(denylist)
+     end
+
+     # Main method, it starts crawling on the given URL and calls a block for each of the pages found.
+     def start_crawling(url, &block)
+       Grell.logger.info "GRELL Started crawling"
+       @collection = PageCollection.new(@add_match_block)
+       @collection.create_page(url, nil)
+
+       while !@collection.discovered_pages.empty?
+         crawl(@collection.next_page, block)
+         @manager.check_periodic_restart(@collection)
+       end
+
+       Grell.logger.info "GRELL finished crawling"
+     end
+
+     def crawl(site, block)
+       Grell.logger.info "Visiting #{site.url}, visited_links: #{@collection.visited_pages.size}, discovered #{@collection.discovered_pages.size}"
+       crawl_site(site)
+
+       if block # The user of this block can send us a :retry to retry accessing the page
+         while crawl_block(block, site) == :retry
+           Grell.logger.info "Retrying our visit to #{site.url}"
+           crawl_site(site)
+         end
+       end
+
+       site.links.each do |url|
+         @collection.create_page(url, site.id)
+       end
+     end
+
+     private
+
+     def crawl_site(site)
+       site.navigate
+       site.rawpage.page.evaluate_script(@evaluate_in_each_page) if @evaluate_in_each_page
+       filter!(site.links)
+       add_redirect_url(site)
+     end
+
+     # Treat any exceptions from the block as an unavailable page
+     def crawl_block(block, site)
+       block.call(site)
+     rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient,
+            Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::StatusFailError,
+            Capybara::Poltergeist::TimeoutError, Errno::ECONNRESET, URI::InvalidURIError => e
+       site.unavailable_page(404, e)
+     end
+
+     def filter!(links)
+       links.select! { |link| link =~ @allowlist_regexp } if @allowlist_regexp
+       links.delete_if { |link| link =~ @denylist_regexp } if @denylist_regexp
+     end
+
+     # Store the resulting redirected URL along with the original URL
+     def add_redirect_url(site)
+       if site.url != site.current_url
+         @collection.create_page(site.current_url, site.id)
+       end
+     end
+
+   end
+
+ end
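As a quick illustration of the `filter!` logic above (an editorial sketch; the link values are made up), `Regexp.union` folds the allowlist and denylist options into single patterns that each link is matched against:

```ruby
allowlist_regexp = Regexp.union([/games\/.*/, '/fun'])

links = ['http://site.test/games/chess', 'http://site.test/fun', 'http://site.test/about']
links.select! { |link| link =~ allowlist_regexp }
links # => ["http://site.test/games/chess", "http://site.test/fun"]
```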
package/grell/lib/grell/crawler_manager.rb ADDED
@@ -0,0 +1,84 @@
+ module Grell
+   # Manages the state of the crawling process; does not care about individual pages but about logging,
+   # restarting and quitting the crawler correctly.
+   class CrawlerManager
+     # logger: logger to use for Grell's messages
+     # on_periodic_restart: if set, the driver will restart every :each visits (100 default) and execute the :do block
+     # driver: a pre-built Capybara driver to use; defaults to a new CapybaraDriver setup
+     def initialize(logger: nil, on_periodic_restart: {}, driver: nil)
+       Grell.logger = logger ? logger : Logger.new(STDOUT)
+       @periodic_restart_block = on_periodic_restart[:do]
+       @periodic_restart_period = on_periodic_restart[:each] || PAGES_TO_RESTART
+       @driver = driver || CapybaraDriver.new.setup_capybara
+       if @periodic_restart_period <= 0
+         Grell.logger.warn "GRELL. Restart option misconfigured with a negative period. Ignoring option."
+       end
+     end
+
+     # Restarts the PhantomJS process without modifying the state of visited and discovered pages.
+     def restart
+       Grell.logger.info "GRELL. Driver restarting"
+       @driver.restart
+       Grell.logger.info "GRELL. Driver restarted"
+     end
+
+     # Quits the poltergeist driver.
+     def quit
+       Grell.logger.info "GRELL. Driver quitting"
+       @driver.quit
+     end
+
+     # PhantomJS seems to consume more and more memory as it crawls; a periodic restart allows restarting
+     # the driver, potentially calling a block.
+     def check_periodic_restart(collection)
+       return unless @periodic_restart_block
+       return unless @periodic_restart_period > 0
+       return unless (collection.visited_pages.size % @periodic_restart_period).zero?
+       restart
+       @periodic_restart_block.call
+     end
+
+     def self.cleanup_all_processes
+       PhantomJSManager.new.cleanup_all_processes
+     end
+
+     private
+
+     PAGES_TO_RESTART = 100 # Default number of pages before we restart the driver.
+     KILL_TIMEOUT = 2 # Number of seconds we wait till we kill the process.
+
+     # Manages the PhantomJS process
+     class PhantomJSManager
+       def cleanup_all_processes
+         pids = running_phantomjs_pids
+         return if pids.empty?
+         Grell.logger.warn "GRELL. Killing PhantomJS processes: #{pids.inspect}"
+         pids.each do |pid|
+           Grell.logger.warn "GRELL. Sending KILL to PhantomJS process #{pid}"
+           kill_process(pid.to_i)
+         end
+       end
+
+       def running_phantomjs_pids
+         list_phantomjs_processes_cmd = "ps -ef | grep -E 'bin/phantomjs' | grep -v grep"
+         `#{list_phantomjs_processes_cmd} | awk '{print $2;}'`.split("\n")
+       end
+
+       def kill_process(pid)
+         Process.kill('TERM', pid)
+         force_kill(pid)
+       rescue Errno::ESRCH, Errno::ECHILD
+         # successfully terminated
+       rescue => e
+         Grell.logger.error ["GRELL. PhantomJS process could not be killed", e.message, *e.backtrace].join($/)
+       end
+
+       def force_kill(pid)
+         Timeout.timeout(KILL_TIMEOUT) { Process.wait(pid) }
+       rescue Timeout::Error
+         Process.kill('KILL', pid)
+         Process.wait(pid)
+       end
+     end
+   end
+ end
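A short usage sketch for the class-level cleanup helper defined above (an editorial example; running it from a test-suite teardown or cron job is an assumed context, not something this package requires):

```ruby
require 'grell'

# Safety net after a long crawl or a test run: kill any PhantomJS processes left behind.
Grell::CrawlerManager.cleanup_all_processes
```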
package/grell/lib/grell/grell_logger.rb ADDED
@@ -0,0 +1,10 @@
+ require 'logger'
+
+ # Very simple global logger for our crawler.
+ module Grell
+   class << self
+     attr_accessor :logger
+   end
+ end
+
+ Grell.logger = Logger.new(STDOUT)
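Since `Grell.logger` is a plain accessor, it can be pointed at any `Logger`-compatible object; a brief sketch (an editorial example; the log file path is made up):

```ruby
require 'grell'

# Send Grell's messages to a file instead of STDOUT...
Grell.logger = Logger.new('log/crawler.log')

# ...or pass the logger when building the crawler, as the README's options list describes.
crawler = Grell::Crawler.new(logger: Logger.new('log/crawler.log'))
```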