powerdlz23 1.2.3 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. package/Spider/README.md +19 -0
  2. package/Spider/domain.py +18 -0
  3. package/Spider/general.py +51 -0
  4. package/Spider/link_finder.py +25 -0
  5. package/Spider/main.py +50 -0
  6. package/Spider/spider.py +74 -0
  7. package/crawler/.formatter.exs +5 -0
  8. package/crawler/.github/workflows/ci.yml +29 -0
  9. package/crawler/.recode.exs +33 -0
  10. package/crawler/.tool-versions +2 -0
  11. package/crawler/CHANGELOG.md +82 -0
  12. package/crawler/README.md +198 -0
  13. package/crawler/architecture.svg +4 -0
  14. package/crawler/config/config.exs +9 -0
  15. package/crawler/config/dev.exs +5 -0
  16. package/crawler/config/test.exs +5 -0
  17. package/crawler/examples/google_search/scraper.ex +37 -0
  18. package/crawler/examples/google_search/url_filter.ex +11 -0
  19. package/crawler/examples/google_search.ex +77 -0
  20. package/crawler/lib/crawler/dispatcher/worker.ex +14 -0
  21. package/crawler/lib/crawler/dispatcher.ex +20 -0
  22. package/crawler/lib/crawler/fetcher/header_preparer.ex +60 -0
  23. package/crawler/lib/crawler/fetcher/modifier.ex +45 -0
  24. package/crawler/lib/crawler/fetcher/policer.ex +77 -0
  25. package/crawler/lib/crawler/fetcher/recorder.ex +55 -0
  26. package/crawler/lib/crawler/fetcher/requester.ex +32 -0
  27. package/crawler/lib/crawler/fetcher/retrier.ex +43 -0
  28. package/crawler/lib/crawler/fetcher/url_filter.ex +26 -0
  29. package/crawler/lib/crawler/fetcher.ex +81 -0
  30. package/crawler/lib/crawler/http.ex +7 -0
  31. package/crawler/lib/crawler/linker/path_builder.ex +71 -0
  32. package/crawler/lib/crawler/linker/path_expander.ex +59 -0
  33. package/crawler/lib/crawler/linker/path_finder.ex +106 -0
  34. package/crawler/lib/crawler/linker/path_offliner.ex +59 -0
  35. package/crawler/lib/crawler/linker/path_prefixer.ex +46 -0
  36. package/crawler/lib/crawler/linker.ex +173 -0
  37. package/crawler/lib/crawler/options.ex +127 -0
  38. package/crawler/lib/crawler/parser/css_parser.ex +37 -0
  39. package/crawler/lib/crawler/parser/guarder.ex +38 -0
  40. package/crawler/lib/crawler/parser/html_parser.ex +41 -0
  41. package/crawler/lib/crawler/parser/link_parser/link_expander.ex +32 -0
  42. package/crawler/lib/crawler/parser/link_parser.ex +50 -0
  43. package/crawler/lib/crawler/parser.ex +122 -0
  44. package/crawler/lib/crawler/queue_handler.ex +45 -0
  45. package/crawler/lib/crawler/scraper.ex +28 -0
  46. package/crawler/lib/crawler/snapper/dir_maker.ex +45 -0
  47. package/crawler/lib/crawler/snapper/link_replacer.ex +95 -0
  48. package/crawler/lib/crawler/snapper.ex +82 -0
  49. package/crawler/lib/crawler/store/counter.ex +19 -0
  50. package/crawler/lib/crawler/store/page.ex +7 -0
  51. package/crawler/lib/crawler/store.ex +87 -0
  52. package/crawler/lib/crawler/worker.ex +62 -0
  53. package/crawler/lib/crawler.ex +91 -0
  54. package/crawler/mix.exs +78 -0
  55. package/crawler/mix.lock +40 -0
  56. package/crawler/test/fixtures/introducing-elixir.jpg +0 -0
  57. package/crawler/test/integration_test.exs +135 -0
  58. package/crawler/test/lib/crawler/dispatcher/worker_test.exs +7 -0
  59. package/crawler/test/lib/crawler/dispatcher_test.exs +5 -0
  60. package/crawler/test/lib/crawler/fetcher/header_preparer_test.exs +7 -0
  61. package/crawler/test/lib/crawler/fetcher/policer_test.exs +71 -0
  62. package/crawler/test/lib/crawler/fetcher/recorder_test.exs +9 -0
  63. package/crawler/test/lib/crawler/fetcher/requester_test.exs +9 -0
  64. package/crawler/test/lib/crawler/fetcher/retrier_test.exs +7 -0
  65. package/crawler/test/lib/crawler/fetcher/url_filter_test.exs +7 -0
  66. package/crawler/test/lib/crawler/fetcher_test.exs +153 -0
  67. package/crawler/test/lib/crawler/http_test.exs +47 -0
  68. package/crawler/test/lib/crawler/linker/path_builder_test.exs +7 -0
  69. package/crawler/test/lib/crawler/linker/path_expander_test.exs +7 -0
  70. package/crawler/test/lib/crawler/linker/path_finder_test.exs +7 -0
  71. package/crawler/test/lib/crawler/linker/path_offliner_test.exs +7 -0
  72. package/crawler/test/lib/crawler/linker/path_prefixer_test.exs +7 -0
  73. package/crawler/test/lib/crawler/linker_test.exs +7 -0
  74. package/crawler/test/lib/crawler/options_test.exs +7 -0
  75. package/crawler/test/lib/crawler/parser/css_parser_test.exs +7 -0
  76. package/crawler/test/lib/crawler/parser/guarder_test.exs +7 -0
  77. package/crawler/test/lib/crawler/parser/html_parser_test.exs +7 -0
  78. package/crawler/test/lib/crawler/parser/link_parser/link_expander_test.exs +7 -0
  79. package/crawler/test/lib/crawler/parser/link_parser_test.exs +7 -0
  80. package/crawler/test/lib/crawler/parser_test.exs +8 -0
  81. package/crawler/test/lib/crawler/queue_handler_test.exs +7 -0
  82. package/crawler/test/lib/crawler/scraper_test.exs +7 -0
  83. package/crawler/test/lib/crawler/snapper/dir_maker_test.exs +7 -0
  84. package/crawler/test/lib/crawler/snapper/link_replacer_test.exs +7 -0
  85. package/crawler/test/lib/crawler/snapper_test.exs +9 -0
  86. package/crawler/test/lib/crawler/worker_test.exs +5 -0
  87. package/crawler/test/lib/crawler_test.exs +295 -0
  88. package/crawler/test/support/test_case.ex +24 -0
  89. package/crawler/test/support/test_helpers.ex +28 -0
  90. package/crawler/test/test_helper.exs +7 -0
  91. package/grell/.rspec +2 -0
  92. package/grell/.travis.yml +28 -0
  93. package/grell/CHANGELOG.md +111 -0
  94. package/grell/Gemfile +7 -0
  95. package/grell/LICENSE.txt +22 -0
  96. package/grell/README.md +213 -0
  97. package/grell/Rakefile +2 -0
  98. package/grell/grell.gemspec +36 -0
  99. package/grell/lib/grell/capybara_driver.rb +44 -0
  100. package/grell/lib/grell/crawler.rb +83 -0
  101. package/grell/lib/grell/crawler_manager.rb +84 -0
  102. package/grell/lib/grell/grell_logger.rb +10 -0
  103. package/grell/lib/grell/page.rb +275 -0
  104. package/grell/lib/grell/page_collection.rb +62 -0
  105. package/grell/lib/grell/rawpage.rb +62 -0
  106. package/grell/lib/grell/reader.rb +18 -0
  107. package/grell/lib/grell/version.rb +3 -0
  108. package/grell/lib/grell.rb +11 -0
  109. package/grell/spec/lib/capybara_driver_spec.rb +38 -0
  110. package/grell/spec/lib/crawler_manager_spec.rb +174 -0
  111. package/grell/spec/lib/crawler_spec.rb +361 -0
  112. package/grell/spec/lib/page_collection_spec.rb +159 -0
  113. package/grell/spec/lib/page_spec.rb +418 -0
  114. package/grell/spec/lib/reader_spec.rb +43 -0
  115. package/grell/spec/spec_helper.rb +66 -0
  116. package/heartmagic/config.py +1 -0
  117. package/heartmagic/heart.py +3 -0
  118. package/heartmagic/pytransform/__init__.py +483 -0
  119. package/heartmagic/pytransform/_pytransform.dll +0 -0
  120. package/heartmagic/pytransform/_pytransform.so +0 -0
  121. package/httpStatusCode/README.md +2 -0
  122. package/httpStatusCode/httpStatusCode.js +4 -0
  123. package/httpStatusCode/reasonPhrases.js +344 -0
  124. package/httpStatusCode/statusCodes.js +344 -0
  125. package/package.json +1 -1
  126. package/rubyretriever/.rspec +2 -0
  127. package/rubyretriever/.travis.yml +7 -0
  128. package/rubyretriever/Gemfile +3 -0
  129. package/rubyretriever/Gemfile.lock +64 -0
  130. package/rubyretriever/LICENSE +20 -0
  131. package/rubyretriever/Rakefile +7 -0
  132. package/rubyretriever/bin/rr +79 -0
  133. package/rubyretriever/lib/retriever/cli.rb +25 -0
  134. package/rubyretriever/lib/retriever/core_ext.rb +13 -0
  135. package/rubyretriever/lib/retriever/fetch.rb +268 -0
  136. package/rubyretriever/lib/retriever/fetchfiles.rb +71 -0
  137. package/rubyretriever/lib/retriever/fetchseo.rb +18 -0
  138. package/rubyretriever/lib/retriever/fetchsitemap.rb +43 -0
  139. package/rubyretriever/lib/retriever/link.rb +47 -0
  140. package/rubyretriever/lib/retriever/openuri_redirect_patch.rb +8 -0
  141. package/rubyretriever/lib/retriever/page.rb +104 -0
  142. package/rubyretriever/lib/retriever/page_iterator.rb +21 -0
  143. package/rubyretriever/lib/retriever/target.rb +47 -0
  144. package/rubyretriever/lib/retriever/version.rb +4 -0
  145. package/rubyretriever/lib/retriever.rb +15 -0
  146. package/rubyretriever/readme.md +166 -0
  147. package/rubyretriever/rubyretriever.gemspec +41 -0
  148. package/rubyretriever/spec/link_spec.rb +77 -0
  149. package/rubyretriever/spec/page_spec.rb +94 -0
  150. package/rubyretriever/spec/retriever_spec.rb +84 -0
  151. package/rubyretriever/spec/spec_helper.rb +17 -0
  152. package/rubyretriever/spec/target_spec.rb +55 -0
  153. package/snapcrawl/.changelog.old.md +157 -0
  154. package/snapcrawl/.gitattributes +1 -0
  155. package/snapcrawl/.github/workflows/test.yml +41 -0
  156. package/snapcrawl/.rspec +3 -0
  157. package/snapcrawl/.rubocop.yml +23 -0
  158. package/snapcrawl/CHANGELOG.md +182 -0
  159. package/snapcrawl/Gemfile +15 -0
  160. package/snapcrawl/LICENSE +21 -0
  161. package/snapcrawl/README.md +135 -0
  162. package/snapcrawl/Runfile +35 -0
  163. package/snapcrawl/bin/snapcrawl +25 -0
  164. package/snapcrawl/lib/snapcrawl/cli.rb +52 -0
  165. package/snapcrawl/lib/snapcrawl/config.rb +60 -0
  166. package/snapcrawl/lib/snapcrawl/crawler.rb +98 -0
  167. package/snapcrawl/lib/snapcrawl/dependencies.rb +21 -0
  168. package/snapcrawl/lib/snapcrawl/exceptions.rb +5 -0
  169. package/snapcrawl/lib/snapcrawl/log_helpers.rb +36 -0
  170. package/snapcrawl/lib/snapcrawl/page.rb +118 -0
  171. package/snapcrawl/lib/snapcrawl/pretty_logger.rb +11 -0
  172. package/snapcrawl/lib/snapcrawl/refinements/pair_split.rb +26 -0
  173. package/snapcrawl/lib/snapcrawl/refinements/string_refinements.rb +13 -0
  174. package/snapcrawl/lib/snapcrawl/screenshot.rb +73 -0
  175. package/snapcrawl/lib/snapcrawl/templates/config.yml +49 -0
  176. package/snapcrawl/lib/snapcrawl/templates/docopt.txt +26 -0
  177. package/snapcrawl/lib/snapcrawl/version.rb +3 -0
  178. package/snapcrawl/lib/snapcrawl.rb +20 -0
  179. package/snapcrawl/snapcrawl.gemspec +27 -0
  180. package/snapcrawl/snapcrawl.yml +41 -0
  181. package/snapcrawl/spec/README.md +16 -0
  182. package/snapcrawl/spec/approvals/bin/help +26 -0
  183. package/snapcrawl/spec/approvals/bin/usage +4 -0
  184. package/snapcrawl/spec/approvals/cli/usage +4 -0
  185. package/snapcrawl/spec/approvals/config/defaults +15 -0
  186. package/snapcrawl/spec/approvals/config/minimal +15 -0
  187. package/snapcrawl/spec/approvals/integration/blacklist +14 -0
  188. package/snapcrawl/spec/approvals/integration/default-config +14 -0
  189. package/snapcrawl/spec/approvals/integration/depth-0 +6 -0
  190. package/snapcrawl/spec/approvals/integration/depth-3 +6 -0
  191. package/snapcrawl/spec/approvals/integration/log-color-no +6 -0
  192. package/snapcrawl/spec/approvals/integration/screenshot-error +3 -0
  193. package/snapcrawl/spec/approvals/integration/whitelist +14 -0
  194. package/snapcrawl/spec/approvals/models/pretty_logger/colors +1 -0
  195. package/snapcrawl/spec/fixtures/config/minimal.yml +4 -0
  196. package/snapcrawl/spec/server/config.ru +97 -0
  197. package/snapcrawl/spec/snapcrawl/bin_spec.rb +15 -0
  198. package/snapcrawl/spec/snapcrawl/cli_spec.rb +9 -0
  199. package/snapcrawl/spec/snapcrawl/config_spec.rb +26 -0
  200. package/snapcrawl/spec/snapcrawl/integration_spec.rb +65 -0
  201. package/snapcrawl/spec/snapcrawl/page_spec.rb +89 -0
  202. package/snapcrawl/spec/snapcrawl/pretty_logger_spec.rb +19 -0
  203. package/snapcrawl/spec/snapcrawl/refinements/pair_split_spec.rb +27 -0
  204. package/snapcrawl/spec/snapcrawl/refinements/string_refinements_spec.rb +29 -0
  205. package/snapcrawl/spec/snapcrawl/screenshot_spec.rb +62 -0
  206. package/snapcrawl/spec/spec_helper.rb +22 -0
  207. package/snapcrawl/spec/spec_mixin.rb +10 -0
package/crawler/test/lib/crawler_test.exs ADDED
@@ -0,0 +1,295 @@
+ defmodule CrawlerTest do
+   use Crawler.TestCase, async: false
+
+   alias Crawler.Store
+
+   @moduletag capture_log: true
+
+   doctest Crawler
+
+   test ".crawl", %{bypass: bypass, url: url} do
+     Store.ops_reset()
+
+     url = "#{url}/crawler"
+     linked_url1 = "#{url}/link1"
+     linked_url2 = "#{url}/link2"
+     linked_url3 = "#{url}/link3"
+     linked_url4 = "#{url}/link4"
+
+     Bypass.expect_once(bypass, "GET", "/crawler", fn conn ->
+       Plug.Conn.resp(conn, 200, """
+       <html><a href="#{linked_url1}">1</a></html>
+       <html><a href="#{linked_url2}">2</a></html>
+       """)
+     end)
+
+     Bypass.expect_once(bypass, "GET", "/crawler/link1", fn conn ->
+       Plug.Conn.resp(conn, 200, """
+       <html><a href="#{linked_url2}">2</a></html>
+       """)
+     end)
+
+     Bypass.expect_once(bypass, "GET", "/crawler/link2", fn conn ->
+       Plug.Conn.resp(conn, 200, """
+       <html><a href="#{linked_url3}">3</a></html>
+       """)
+     end)
+
+     Bypass.expect_once(bypass, "GET", "/crawler/link3", fn conn ->
+       Plug.Conn.resp(conn, 200, """
+       <html><a href="#{linked_url4}">4</a></html>
+       """)
+     end)
+
+     {:ok, opts} = Crawler.crawl(url, max_depths: 3, workers: 3, interval: 100, store: Store)
+
+     assert Crawler.running?(opts)
+
+     Crawler.pause(opts)
+
+     refute Crawler.running?(opts)
+
+     assert opts[:workers] == 3
+
+     Crawler.resume(opts)
+
+     assert Crawler.running?(opts)
+
+     wait(fn ->
+       assert Store.ops_count() == 4
+     end)
+
+     wait(fn ->
+       assert %Store.Page{url: ^url, opts: %{workers: 3}} = Store.find_processed({url, nil})
+
+       assert Store.find_processed({linked_url1, nil})
+       assert Store.find_processed({linked_url2, nil})
+       assert Store.find_processed({linked_url3, nil})
+       refute Store.find({linked_url4, nil})
+
+       urls = Crawler.Store.all_urls()
+
+       assert Enum.member?(urls, {url, nil})
+       assert Enum.member?(urls, {linked_url1, nil})
+       assert Enum.member?(urls, {linked_url2, nil})
+       assert Enum.member?(urls, {linked_url3, nil})
+       refute Enum.member?(urls, {linked_url4, nil})
+     end)
+
+     wait(fn ->
+       refute Crawler.running?(opts)
+       assert OPQ.info(opts[:queue]) == {:normal, %OPQ.Queue{data: {[], []}}, 3}
+     end)
+   end
+
+   test ".crawl without a store", %{bypass: bypass, url: url} do
+     url = "#{url}/crawler_without_store"
+
+     Bypass.expect_once(bypass, "GET", "/crawler_without_store", fn conn ->
+       Plug.Conn.resp(conn, 200, "200")
+     end)
+
+     {:ok, opts} = Crawler.crawl(url, max_depths: 1, workers: 1, interval: 100, store: nil)
+
+     wait(fn ->
+       assert %Store.Page{url: ^url, body: nil, opts: nil} = Store.find_processed({url, nil})
+     end)
+
+     wait(fn ->
+       assert OPQ.info(opts[:queue]) == {:normal, %OPQ.Queue{data: {[], []}}, 1}
+     end)
+   end
+
+   test ".crawl with max_pages", %{bypass: bypass, url: url} do
+     Store.ops_reset()
+
+     url = "#{url}/crawler_with_max_pages"
+     linked_url1 = "#{url}/link1"
+     linked_url2 = "#{url}/link2"
+     linked_url3 = "#{url}/link3"
+     linked_url4 = "#{url}/link4"
+     linked_url5 = "#{url}/link5"
+
+     Bypass.expect_once(bypass, "GET", "/crawler_with_max_pages", fn conn ->
+       Plug.Conn.resp(conn, 200, """
+       <html><a href="#{linked_url1}">1</a></html>
+       <html><a href="#{linked_url2}">2</a></html>
+       <html><a href="#{linked_url3}">3</a></html>
+       <html><a href="#{linked_url4}">4</a></html>
+       <html><a href="#{linked_url5}">5</a></html>
+       """)
+     end)
+
+     Bypass.expect_once(bypass, "GET", "/crawler_with_max_pages/link1", fn conn ->
+       Plug.Conn.resp(conn, 200, """
+       <html><a href="#{linked_url2}">2</a></html>
+       <html><a href="#{linked_url3}">3</a></html>
+       """)
+     end)
+
+     Bypass.expect_once(bypass, "GET", "/crawler_with_max_pages/link2", fn conn ->
+       Plug.Conn.resp(conn, 200, """
+       <html><a href="#{linked_url3}">3</a></html>
+       <html><a href="#{linked_url4}">4</a></html>
+       <html><a href="#{linked_url5}">5</a></html>
+       """)
+     end)
+
+     Bypass.stub(bypass, "GET", "/crawler_with_max_pages/link3", fn conn ->
+       Plug.Conn.resp(conn, 200, """
+       <html><a href="#{linked_url3}">3</a></html>
+       <html><a href="#{linked_url4}">4</a></html>
+       <html><a href="#{linked_url5}">5</a></html>
+       """)
+     end)
+
+     Bypass.stub(bypass, "GET", "/crawler_with_max_pages/link4", fn conn ->
+       Plug.Conn.resp(conn, 200, """
+       <html><a href="#{linked_url3}">3</a></html>
+       <html><a href="#{linked_url4}">4</a></html>
+       <html><a href="#{linked_url5}">5</a></html>
+       """)
+     end)
+
+     {:ok, opts} = Crawler.crawl(url, max_depths: 3, force: true, workers: 4, max_pages: 3, interval: 100)
+
+     wait(fn ->
+       assert Store.ops_count() == 4
+     end)
+
+     wait(fn ->
+       assert Store.find_processed({url, opts[:scope]})
+       assert Store.find_processed({linked_url1, opts[:scope]})
+       assert Store.find_processed({linked_url2, opts[:scope]})
+       assert Store.find_processed({linked_url3, opts[:scope]})
+       refute Store.find({linked_url4, opts[:scope]})
+       refute Store.find({linked_url5, opts[:scope]})
+     end)
+
+     wait(fn ->
+       assert OPQ.info(opts[:queue]) == {:normal, %OPQ.Queue{data: {[], []}}, 4}
+     end)
+   end
+
+   test ".crawl with an existing queue", %{bypass: bypass, url: url} do
+     Store.ops_reset()
+
+     url = "#{url}/crawler_with_queue"
+     linked_url1 = "#{url}/link1"
+     linked_url2 = "#{url}/link2"
+     linked_url3 = "#{url}/link3"
+     linked_url4 = "#{url}/link4"
+
+     Bypass.expect_once(bypass, "GET", "/crawler_with_queue/link1", fn conn ->
+       Plug.Conn.resp(conn, 200, """
+       <html><a href="#{linked_url2}">2</a></html>
+       """)
+     end)
+
+     Bypass.expect_once(bypass, "GET", "/crawler_with_queue/link2", fn conn ->
+       Plug.Conn.resp(conn, 200, """
+       <html><a href="#{linked_url3}">3</a></html>
+       """)
+     end)
+
+     Bypass.expect_once(bypass, "GET", "/crawler_with_queue/link3", fn conn ->
+       Plug.Conn.resp(conn, 200, """
+       <html>ok</html>
+       """)
+     end)
+
+     {:ok, queue} = OPQ.init(worker: Crawler.Dispatcher.Worker, workers: 2, interval: 100)
+
+     {:ok, opts1} = Crawler.crawl(linked_url1, store: Store, queue: queue)
+     {:ok, opts2} = Crawler.crawl(linked_url2, store: Store, queue: queue)
+
+     wait(fn ->
+       assert Store.ops_count() == 3
+     end)
+
+     wait(fn ->
+       assert Store.find_processed({linked_url1, nil})
+       assert Store.find_processed({linked_url2, nil})
+       assert Store.find_processed({linked_url3, nil})
+       refute Store.find_processed({linked_url4, nil})
+
+       urls = Crawler.Store.all_urls()
+
+       assert Enum.member?(urls, {linked_url1, nil})
+       assert Enum.member?(urls, {linked_url2, nil})
+       assert Enum.member?(urls, {linked_url3, nil})
+       refute Enum.member?(urls, {linked_url4, nil})
+     end)
+
+     wait(fn ->
+       assert OPQ.info(opts1[:queue]) == {:normal, %OPQ.Queue{data: {[], []}}, 2}
+       assert OPQ.info(opts2[:queue]) == {:normal, %OPQ.Queue{data: {[], []}}, 2}
+     end)
+   end
+
+   test ".crawl forced", %{bypass: bypass, url: url} do
+     Store.ops_reset()
+
+     url = "#{url}/crawler_forced"
+     linked_url1 = "#{url}/link1"
+     linked_url2 = "#{url}/link2"
+
+     Bypass.expect(bypass, "GET", "/crawler_forced", fn conn ->
+       Plug.Conn.resp(conn, 200, """
+       <html><a href="#{linked_url1}">1</a></html>
+       <html><a href="#{linked_url1}">1</a></html>
+       """)
+     end)
+
+     Bypass.expect(bypass, "GET", "/crawler_forced/link1", fn conn ->
+       Plug.Conn.resp(conn, 200, """
+       <html><a href="#{linked_url2}">2</a></html>
+       """)
+     end)
+
+     Bypass.expect(bypass, "GET", "/crawler_forced/link2", fn conn ->
+       Plug.Conn.resp(conn, 200, """
+       <html>ok</html>
+       """)
+     end)
+
+     {:ok, opts1} = Crawler.crawl(url, force: true, workers: 1, interval: 100)
+     {:ok, opts2} = Crawler.crawl(url, force: true, workers: 2, interval: 100)
+
+     refute opts1[:scope] == opts2[:scope]
+
+     wait(fn ->
+       assert Store.find_processed({url, opts1[:scope]})
+       assert Store.find_processed({url, opts2[:scope]})
+       assert Store.find_processed({linked_url1, opts1[:scope]})
+       assert Store.find_processed({linked_url1, opts2[:scope]})
+       assert Store.find_processed({linked_url2, opts1[:scope]})
+       assert Store.find_processed({linked_url2, opts2[:scope]})
+
+       assert Store.ops_count() >= 6
+       assert Store.ops_count() <= 10
+
+       assert OPQ.info(opts1[:queue]) == {:normal, %OPQ.Queue{data: {[], []}}, 1}
+       assert OPQ.info(opts2[:queue]) == {:normal, %OPQ.Queue{data: {[], []}}, 2}
+     end)
+   end
+
+   test ".crawl stopped", %{bypass: bypass, url: url} do
+     url = "#{url}/stop"
+     linked_url = "#{url}/stop1"
+
+     Bypass.expect_once(bypass, "GET", "/stop", fn conn ->
+       Plug.Conn.resp(conn, 200, """
+       <html><a href="#{linked_url}">1</a></html>
+       """)
+     end)
+
+     {:ok, opts} = Crawler.crawl(url, workers: 1, interval: 500)
+
+     Process.sleep(200)
+
+     Crawler.stop(opts)
+
+     refute Store.find({linked_url, nil})
+   end
+ end
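
Note: the test file above doubles as a tour of Crawler's lifecycle API. A minimal sketch of the same flow outside the test suite, assuming only the calls and option names exercised above (the URL is a placeholder):

    # Options (:max_depths, :workers, :interval, :store) are the ones used in the tests above.
    {:ok, opts} = Crawler.crawl("http://example.com/", max_depths: 2, workers: 2, interval: 100, store: Crawler.Store)

    Crawler.running?(opts)  # => true while the queue is being worked
    Crawler.pause(opts)     # workers stop pulling from the OPQ queue
    Crawler.resume(opts)
    Crawler.stop(opts)

    # Processed pages are keyed by {url, scope}; scope is nil unless the crawl was forced.
    Crawler.Store.find_processed({"http://example.com/", nil})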
package/crawler/test/support/test_case.ex ADDED
@@ -0,0 +1,24 @@
+ defmodule Crawler.TestCase do
+   use ExUnit.CaseTemplate
+
+   using do
+     quote do
+       import Crawler.TestHelpers
+     end
+   end
+
+   setup_all do
+     bypass = Bypass.open()
+     url = "http://localhost:#{bypass.port}"
+     path = "localhost-#{bypass.port}"
+
+     bypass2 = Bypass.open()
+     url2 = "http://localhost:#{bypass2.port}"
+     path2 = "localhost-#{bypass2.port}"
+
+     {
+       :ok,
+       bypass: bypass, url: url, path: path, bypass2: bypass2, url2: url2, path2: path2
+     }
+   end
+ end
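
Aside: this case template is why the tests above can pattern-match %{bypass: bypass, url: url} — ExUnit merges the keyword list returned from setup_all into each test's context. A minimal hypothetical consumer:

    defmodule ExampleTest do
      use Crawler.TestCase, async: false

      # Context keys mirror the keyword list returned by setup_all.
      test "exposes the shared Bypass server", %{bypass: bypass, url: url} do
        assert url == "http://localhost:#{bypass.port}"
      end
    end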
package/crawler/test/support/test_helpers.ex ADDED
@@ -0,0 +1,28 @@
+ # Credit: https://gist.github.com/cblavier/5e15791387a6e22b98d8
+ defmodule Crawler.TestHelpers do
+   def wait(fun), do: wait(500, fun)
+   def wait(0, fun), do: fun.()
+
+   def wait(timeout, fun) do
+     try do
+       fun.()
+     rescue
+       _ ->
+         :timer.sleep(10)
+         wait(max(0, timeout - 10), fun)
+     end
+   end
+
+   def tmp(path \\ "", filename \\ "") do
+     tmp_path = Path.join([File.cwd!(), "test", "tmp", path])
+
+     File.mkdir_p(tmp_path)
+
+     Path.join(tmp_path, filename)
+   end
+
+   def image_file do
+     {:ok, file} = File.read("test/fixtures/introducing-elixir.jpg")
+     file
+   end
+ end
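
Aside: wait/1 is a polling assertion helper. It retries the given function every 10 ms, burning down a 500 ms budget; the wait(0, fun) clause makes the final attempt without a rescue, so the underlying assertion error propagates. Typical usage, as seen throughout crawler_test.exs above:

    import Crawler.TestHelpers

    # Re-runs the assertion every 10 ms until it passes or ~500 ms elapse.
    wait(fn ->
      assert Crawler.Store.ops_count() == 4
    end)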
package/crawler/test/test_helper.exs ADDED
@@ -0,0 +1,7 @@
+ [File.cwd!(), "test", "tmp", "*"]
+ |> Path.join()
+ |> Path.wildcard()
+ |> Enum.each(&File.rm_rf/1)
+
+ ExUnit.start()
+ Application.ensure_all_started(:bypass)
package/grell/.rspec ADDED
@@ -0,0 +1,2 @@
+ --color
+ --require spec_helper
package/grell/.travis.yml ADDED
@@ -0,0 +1,28 @@
+ language: ruby
+ cache: bundler
+
+ rvm:
+   - 2.2.4
+   - 2.3.0
+   - 2.4.2
+
+ before_install:
+   - mkdir travis-phantomjs
+   - wget https://github.com/JordiPolo/phantomjs/blob/master/phantomjs-2.1.1-linux-x86_64.tar.bz2?raw=true
+     -O $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2
+   - tar -xvf $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2 -C $PWD/travis-phantomjs
+   - export PATH=$PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64/bin:$PATH
+
+ install:
+   - bundle install --jobs=3 --retry=3
+
+ script:
+   - bundle exec rspec
+
+ deploy:
+   provider: rubygems
+   api_key:
+     secure: czStDI0W6MWL70sDwu53oNNCc8vKtT61pgvii+ZWIC9A41C2p7BzmbtosXsnLk2ApxmpWvFIgtQE0XIH7jkM5mY05cHinXDphtOTkNLFVjck3ZOMkx/cc+QRFW8K4FHkrzFsC+/Xx4t2/Psh35LpzhfJd0XzKKoCstXUVgJsfGcAK3DMpjXHSUbwLXGDZ4lzmsk52OLf0oL+in2447TJfVOvGXtYmfh1PjXRwDxKB0dan7w5mVgajS52b6wUhVPTaMe/JgCbMuV7BaQ1Goq8u7V4aaxU+liPAhzHWfMB6tF4TEW8yu2tvGLdOA0+1jmM8E9Q5saPWtwKiHvBxN8CzRpkiNDzyFAf8ljrWT5yKX3aRQCyPp3NNyhoumWap36b+O/zwZ3HxoAe22Yg0rjz8z8NxMR/ELPvjPYjCiF5zY7fO9PAzmIynMRUrxDnFj+/JGHdzx0ZMo3fEXgHHSaHPNxIzEffVVQk4XLVnFHDjBLY4mVp4sbHbja5qnui20RkdM/H9Yi/fQyl1ODhk+LUPoh45ZneDZq7GPrl+WKK06oEjXIXLU+1iEuqnSqybbmJMTUJlUV+7EJdtq2DgfDB4KXwLm2LLOR/IX63AzEav4NIxx3hIXifSKa9rp6D7nMTzdQwF0FFzIj/Y3qLrAe1WWt0gx3Vxq67pSwOJthk5Fc=
+   on:
+     tags: true
+     rvm: 2.4.2
package/grell/CHANGELOG.md ADDED
@@ -0,0 +1,111 @@
+ # 2.1.2
+ * Change white/black lists to allow/deny lists
+
+ # 2.1.1
+ * Update phantomjs_options to use 'TLSv1.2'
+
+ # 2.1.0
+ * Delete `driver_options` configuration key as it was never used.
+ * `cleanup_all_processes` is a self method, as intended.
+
+ # 2.0.0
+ * New configuration key `on_periodic_restart`.
+ * The CrawlerManager.cleanup_all_processes method destroys all instances of phantomjs on this machine.
+
+ * Breaking changes
+   - Requires Ruby 2.1 or later.
+   - Crawler.start_crawling does not accept options anymore; all options are passed to Crawler.new.
+   - Crawler's methods `restart` and `quit` have been moved to CrawlerManager.
+   - Crawler gets whitelist and blacklist as configuration options instead of them being set in specific methods.
+
+ # 1.6.11
+ * Ensure all links are loaded by waiting for Ajax requests to complete
+ * Add '@evaluate_in_each_page' option to evaluate before extracting links (e.g. $('.dropdown').addClass('open');)
+
+ # 1.6.10
+ * Avoid following JS href links; add missing dependencies to fix the Travis build
+
+ # 1.6.9
+ * Avoid following links when disabled by CSS (1.6.8 worked only for JavaScript)
+
+ # 1.6.8
+ * Avoid following disabled links
+
+ # 1.6.7
+ * Increment '@times_visited' first to avoid infinite retries when rescuing errors
+
+ # 1.6.6
+ * Updated phantomjs_logger not to open '/dev/null'
+
+ # 1.6.5
+ * Added #quit to Crawler
+
+ # 1.6.4
+ * Added #quit to the Capybara driver
+
+ # 1.6.3
+ * Only follow visible links
+
+ # 1.6.2
+ * Reset Capybara driver to Puffing Billy (used to rewrite URL requests in specs)
+ * Use a float timestamp for the Poltergeist driver name to support fast test executions
+
+ # 1.6.1
+ * Use a non-static name to support registering the Poltergeist crawler multiple times
+ * More exception handling; store redirected URLs in addition to the original URL
+
+ # 1.6
+ * Support custom URL comparison when adding new pages during crawling
+ * Don't rescue Timeout errors, so that Delayed Job can properly terminate hanging jobs
+ * Fail early if Capybara doesn't initialize properly
+
+ # 1.5.1
+ * Fixed deprecation warning (Thanks, scott)
+ * Updated Poltergeist dependency
+
+ # 1.5.0
+ * Grell will follow redirects.
+ * Added #followed_redirects?, #error?, and #current_url methods to the Page class
+
+ # 1.4.0
+ * Added crawler.restart to restart the browser process
+ * A block of code can make grell retry any given page.
+
+ # 1.3.2
+ * Rescue Timeout errors and return an empty page when that happens
+
+ # 1.3.1
+ * Added whitelisting and blacklisting
+ * Better info in the gemspec
+
+ # 1.3
+ * The Crawler object allows you to provide an external logger object.
+ * Clearer semantics when an error happens: special headers are returned so the user can inspect the error
+ * Caveats:
+   - The 'debug' option in the crawler does not have any effect anymore. Provide an external logger with 'logger' instead.
+   - The error headers provided by grell have changed from 'grell_status' to 'grellStatus'.
+   - The 'visited' property in the page was never supposed to be accessible. Use 'visited?' instead.
+
+ # 1.2.1
+ * Solve bug: URLs are case insensitive
+
+ # 1.2
+ * Grell will now consider two links to point to the same page only when the whole URL is exactly the same.
+   Previous versions considered two links the same when they merely shared the path.
+
+ # 1.1.2
+ * Solve bug where links in the head were added as if they were normal links in the body
+
+ # 1.1.1
+ * Solve bug with the new data-href functionality
+
+ # 1.1
+ * Solve problem with a randomly failing spec
+ * Search for elements with 'href' or 'data-href' to find links
+
+ # 1.0.1
+ * Rescuing JavaScript errors
+
+ # 1.0
+ * Initial implementation
+ * Basic support for crawling pages.
package/grell/Gemfile ADDED
@@ -0,0 +1,7 @@
+ source 'https://rubygems.org'
+
+ # Rack >= 2.0 is not compatible with Ruby 2.1, so pin Rack 1.x there
+ platform :ruby_21 do
+   gem 'rack', '~> 1.0'
+ end
+ gemspec
package/grell/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2015 Medidata Solutions Worldwide
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.