powerdlz23 1.2.3 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207) hide show
  1. package/Spider/README.md +19 -0
  2. package/Spider/domain.py +18 -0
  3. package/Spider/general.py +51 -0
  4. package/Spider/link_finder.py +25 -0
  5. package/Spider/main.py +50 -0
  6. package/Spider/spider.py +74 -0
  7. package/crawler/.formatter.exs +5 -0
  8. package/crawler/.github/workflows/ci.yml +29 -0
  9. package/crawler/.recode.exs +33 -0
  10. package/crawler/.tool-versions +2 -0
  11. package/crawler/CHANGELOG.md +82 -0
  12. package/crawler/README.md +198 -0
  13. package/crawler/architecture.svg +4 -0
  14. package/crawler/config/config.exs +9 -0
  15. package/crawler/config/dev.exs +5 -0
  16. package/crawler/config/test.exs +5 -0
  17. package/crawler/examples/google_search/scraper.ex +37 -0
  18. package/crawler/examples/google_search/url_filter.ex +11 -0
  19. package/crawler/examples/google_search.ex +77 -0
  20. package/crawler/lib/crawler/dispatcher/worker.ex +14 -0
  21. package/crawler/lib/crawler/dispatcher.ex +20 -0
  22. package/crawler/lib/crawler/fetcher/header_preparer.ex +60 -0
  23. package/crawler/lib/crawler/fetcher/modifier.ex +45 -0
  24. package/crawler/lib/crawler/fetcher/policer.ex +77 -0
  25. package/crawler/lib/crawler/fetcher/recorder.ex +55 -0
  26. package/crawler/lib/crawler/fetcher/requester.ex +32 -0
  27. package/crawler/lib/crawler/fetcher/retrier.ex +43 -0
  28. package/crawler/lib/crawler/fetcher/url_filter.ex +26 -0
  29. package/crawler/lib/crawler/fetcher.ex +81 -0
  30. package/crawler/lib/crawler/http.ex +7 -0
  31. package/crawler/lib/crawler/linker/path_builder.ex +71 -0
  32. package/crawler/lib/crawler/linker/path_expander.ex +59 -0
  33. package/crawler/lib/crawler/linker/path_finder.ex +106 -0
  34. package/crawler/lib/crawler/linker/path_offliner.ex +59 -0
  35. package/crawler/lib/crawler/linker/path_prefixer.ex +46 -0
  36. package/crawler/lib/crawler/linker.ex +173 -0
  37. package/crawler/lib/crawler/options.ex +127 -0
  38. package/crawler/lib/crawler/parser/css_parser.ex +37 -0
  39. package/crawler/lib/crawler/parser/guarder.ex +38 -0
  40. package/crawler/lib/crawler/parser/html_parser.ex +41 -0
  41. package/crawler/lib/crawler/parser/link_parser/link_expander.ex +32 -0
  42. package/crawler/lib/crawler/parser/link_parser.ex +50 -0
  43. package/crawler/lib/crawler/parser.ex +122 -0
  44. package/crawler/lib/crawler/queue_handler.ex +45 -0
  45. package/crawler/lib/crawler/scraper.ex +28 -0
  46. package/crawler/lib/crawler/snapper/dir_maker.ex +45 -0
  47. package/crawler/lib/crawler/snapper/link_replacer.ex +95 -0
  48. package/crawler/lib/crawler/snapper.ex +82 -0
  49. package/crawler/lib/crawler/store/counter.ex +19 -0
  50. package/crawler/lib/crawler/store/page.ex +7 -0
  51. package/crawler/lib/crawler/store.ex +87 -0
  52. package/crawler/lib/crawler/worker.ex +62 -0
  53. package/crawler/lib/crawler.ex +91 -0
  54. package/crawler/mix.exs +78 -0
  55. package/crawler/mix.lock +40 -0
  56. package/crawler/test/fixtures/introducing-elixir.jpg +0 -0
  57. package/crawler/test/integration_test.exs +135 -0
  58. package/crawler/test/lib/crawler/dispatcher/worker_test.exs +7 -0
  59. package/crawler/test/lib/crawler/dispatcher_test.exs +5 -0
  60. package/crawler/test/lib/crawler/fetcher/header_preparer_test.exs +7 -0
  61. package/crawler/test/lib/crawler/fetcher/policer_test.exs +71 -0
  62. package/crawler/test/lib/crawler/fetcher/recorder_test.exs +9 -0
  63. package/crawler/test/lib/crawler/fetcher/requester_test.exs +9 -0
  64. package/crawler/test/lib/crawler/fetcher/retrier_test.exs +7 -0
  65. package/crawler/test/lib/crawler/fetcher/url_filter_test.exs +7 -0
  66. package/crawler/test/lib/crawler/fetcher_test.exs +153 -0
  67. package/crawler/test/lib/crawler/http_test.exs +47 -0
  68. package/crawler/test/lib/crawler/linker/path_builder_test.exs +7 -0
  69. package/crawler/test/lib/crawler/linker/path_expander_test.exs +7 -0
  70. package/crawler/test/lib/crawler/linker/path_finder_test.exs +7 -0
  71. package/crawler/test/lib/crawler/linker/path_offliner_test.exs +7 -0
  72. package/crawler/test/lib/crawler/linker/path_prefixer_test.exs +7 -0
  73. package/crawler/test/lib/crawler/linker_test.exs +7 -0
  74. package/crawler/test/lib/crawler/options_test.exs +7 -0
  75. package/crawler/test/lib/crawler/parser/css_parser_test.exs +7 -0
  76. package/crawler/test/lib/crawler/parser/guarder_test.exs +7 -0
  77. package/crawler/test/lib/crawler/parser/html_parser_test.exs +7 -0
  78. package/crawler/test/lib/crawler/parser/link_parser/link_expander_test.exs +7 -0
  79. package/crawler/test/lib/crawler/parser/link_parser_test.exs +7 -0
  80. package/crawler/test/lib/crawler/parser_test.exs +8 -0
  81. package/crawler/test/lib/crawler/queue_handler_test.exs +7 -0
  82. package/crawler/test/lib/crawler/scraper_test.exs +7 -0
  83. package/crawler/test/lib/crawler/snapper/dir_maker_test.exs +7 -0
  84. package/crawler/test/lib/crawler/snapper/link_replacer_test.exs +7 -0
  85. package/crawler/test/lib/crawler/snapper_test.exs +9 -0
  86. package/crawler/test/lib/crawler/worker_test.exs +5 -0
  87. package/crawler/test/lib/crawler_test.exs +295 -0
  88. package/crawler/test/support/test_case.ex +24 -0
  89. package/crawler/test/support/test_helpers.ex +28 -0
  90. package/crawler/test/test_helper.exs +7 -0
  91. package/grell/.rspec +2 -0
  92. package/grell/.travis.yml +28 -0
  93. package/grell/CHANGELOG.md +111 -0
  94. package/grell/Gemfile +7 -0
  95. package/grell/LICENSE.txt +22 -0
  96. package/grell/README.md +213 -0
  97. package/grell/Rakefile +2 -0
  98. package/grell/grell.gemspec +36 -0
  99. package/grell/lib/grell/capybara_driver.rb +44 -0
  100. package/grell/lib/grell/crawler.rb +83 -0
  101. package/grell/lib/grell/crawler_manager.rb +84 -0
  102. package/grell/lib/grell/grell_logger.rb +10 -0
  103. package/grell/lib/grell/page.rb +275 -0
  104. package/grell/lib/grell/page_collection.rb +62 -0
  105. package/grell/lib/grell/rawpage.rb +62 -0
  106. package/grell/lib/grell/reader.rb +18 -0
  107. package/grell/lib/grell/version.rb +3 -0
  108. package/grell/lib/grell.rb +11 -0
  109. package/grell/spec/lib/capybara_driver_spec.rb +38 -0
  110. package/grell/spec/lib/crawler_manager_spec.rb +174 -0
  111. package/grell/spec/lib/crawler_spec.rb +361 -0
  112. package/grell/spec/lib/page_collection_spec.rb +159 -0
  113. package/grell/spec/lib/page_spec.rb +418 -0
  114. package/grell/spec/lib/reader_spec.rb +43 -0
  115. package/grell/spec/spec_helper.rb +66 -0
  116. package/heartmagic/config.py +1 -0
  117. package/heartmagic/heart.py +3 -0
  118. package/heartmagic/pytransform/__init__.py +483 -0
  119. package/heartmagic/pytransform/_pytransform.dll +0 -0
  120. package/heartmagic/pytransform/_pytransform.so +0 -0
  121. package/httpStatusCode/README.md +2 -0
  122. package/httpStatusCode/httpStatusCode.js +4 -0
  123. package/httpStatusCode/reasonPhrases.js +344 -0
  124. package/httpStatusCode/statusCodes.js +344 -0
  125. package/package.json +1 -1
  126. package/rubyretriever/.rspec +2 -0
  127. package/rubyretriever/.travis.yml +7 -0
  128. package/rubyretriever/Gemfile +3 -0
  129. package/rubyretriever/Gemfile.lock +64 -0
  130. package/rubyretriever/LICENSE +20 -0
  131. package/rubyretriever/Rakefile +7 -0
  132. package/rubyretriever/bin/rr +79 -0
  133. package/rubyretriever/lib/retriever/cli.rb +25 -0
  134. package/rubyretriever/lib/retriever/core_ext.rb +13 -0
  135. package/rubyretriever/lib/retriever/fetch.rb +268 -0
  136. package/rubyretriever/lib/retriever/fetchfiles.rb +71 -0
  137. package/rubyretriever/lib/retriever/fetchseo.rb +18 -0
  138. package/rubyretriever/lib/retriever/fetchsitemap.rb +43 -0
  139. package/rubyretriever/lib/retriever/link.rb +47 -0
  140. package/rubyretriever/lib/retriever/openuri_redirect_patch.rb +8 -0
  141. package/rubyretriever/lib/retriever/page.rb +104 -0
  142. package/rubyretriever/lib/retriever/page_iterator.rb +21 -0
  143. package/rubyretriever/lib/retriever/target.rb +47 -0
  144. package/rubyretriever/lib/retriever/version.rb +4 -0
  145. package/rubyretriever/lib/retriever.rb +15 -0
  146. package/rubyretriever/readme.md +166 -0
  147. package/rubyretriever/rubyretriever.gemspec +41 -0
  148. package/rubyretriever/spec/link_spec.rb +77 -0
  149. package/rubyretriever/spec/page_spec.rb +94 -0
  150. package/rubyretriever/spec/retriever_spec.rb +84 -0
  151. package/rubyretriever/spec/spec_helper.rb +17 -0
  152. package/rubyretriever/spec/target_spec.rb +55 -0
  153. package/snapcrawl/.changelog.old.md +157 -0
  154. package/snapcrawl/.gitattributes +1 -0
  155. package/snapcrawl/.github/workflows/test.yml +41 -0
  156. package/snapcrawl/.rspec +3 -0
  157. package/snapcrawl/.rubocop.yml +23 -0
  158. package/snapcrawl/CHANGELOG.md +182 -0
  159. package/snapcrawl/Gemfile +15 -0
  160. package/snapcrawl/LICENSE +21 -0
  161. package/snapcrawl/README.md +135 -0
  162. package/snapcrawl/Runfile +35 -0
  163. package/snapcrawl/bin/snapcrawl +25 -0
  164. package/snapcrawl/lib/snapcrawl/cli.rb +52 -0
  165. package/snapcrawl/lib/snapcrawl/config.rb +60 -0
  166. package/snapcrawl/lib/snapcrawl/crawler.rb +98 -0
  167. package/snapcrawl/lib/snapcrawl/dependencies.rb +21 -0
  168. package/snapcrawl/lib/snapcrawl/exceptions.rb +5 -0
  169. package/snapcrawl/lib/snapcrawl/log_helpers.rb +36 -0
  170. package/snapcrawl/lib/snapcrawl/page.rb +118 -0
  171. package/snapcrawl/lib/snapcrawl/pretty_logger.rb +11 -0
  172. package/snapcrawl/lib/snapcrawl/refinements/pair_split.rb +26 -0
  173. package/snapcrawl/lib/snapcrawl/refinements/string_refinements.rb +13 -0
  174. package/snapcrawl/lib/snapcrawl/screenshot.rb +73 -0
  175. package/snapcrawl/lib/snapcrawl/templates/config.yml +49 -0
  176. package/snapcrawl/lib/snapcrawl/templates/docopt.txt +26 -0
  177. package/snapcrawl/lib/snapcrawl/version.rb +3 -0
  178. package/snapcrawl/lib/snapcrawl.rb +20 -0
  179. package/snapcrawl/snapcrawl.gemspec +27 -0
  180. package/snapcrawl/snapcrawl.yml +41 -0
  181. package/snapcrawl/spec/README.md +16 -0
  182. package/snapcrawl/spec/approvals/bin/help +26 -0
  183. package/snapcrawl/spec/approvals/bin/usage +4 -0
  184. package/snapcrawl/spec/approvals/cli/usage +4 -0
  185. package/snapcrawl/spec/approvals/config/defaults +15 -0
  186. package/snapcrawl/spec/approvals/config/minimal +15 -0
  187. package/snapcrawl/spec/approvals/integration/blacklist +14 -0
  188. package/snapcrawl/spec/approvals/integration/default-config +14 -0
  189. package/snapcrawl/spec/approvals/integration/depth-0 +6 -0
  190. package/snapcrawl/spec/approvals/integration/depth-3 +6 -0
  191. package/snapcrawl/spec/approvals/integration/log-color-no +6 -0
  192. package/snapcrawl/spec/approvals/integration/screenshot-error +3 -0
  193. package/snapcrawl/spec/approvals/integration/whitelist +14 -0
  194. package/snapcrawl/spec/approvals/models/pretty_logger/colors +1 -0
  195. package/snapcrawl/spec/fixtures/config/minimal.yml +4 -0
  196. package/snapcrawl/spec/server/config.ru +97 -0
  197. package/snapcrawl/spec/snapcrawl/bin_spec.rb +15 -0
  198. package/snapcrawl/spec/snapcrawl/cli_spec.rb +9 -0
  199. package/snapcrawl/spec/snapcrawl/config_spec.rb +26 -0
  200. package/snapcrawl/spec/snapcrawl/integration_spec.rb +65 -0
  201. package/snapcrawl/spec/snapcrawl/page_spec.rb +89 -0
  202. package/snapcrawl/spec/snapcrawl/pretty_logger_spec.rb +19 -0
  203. package/snapcrawl/spec/snapcrawl/refinements/pair_split_spec.rb +27 -0
  204. package/snapcrawl/spec/snapcrawl/refinements/string_refinements_spec.rb +29 -0
  205. package/snapcrawl/spec/snapcrawl/screenshot_spec.rb +62 -0
  206. package/snapcrawl/spec/spec_helper.rb +22 -0
  207. package/snapcrawl/spec/spec_mixin.rb +10 -0
@@ -0,0 +1,4 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <!-- Do not edit this file with editors other than draw.io -->
3
+ <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
4
+ <svg xmlns="http://www.w3.org/2000/svg" style="background-color: rgb(255, 255, 255);" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" width="1038px" height="1102px" viewBox="-0.5 -0.5 1038 1102" content="&lt;mxfile host=&quot;app.diagrams.net&quot; modified=&quot;2023-09-24T13:07:23.863Z&quot; agent=&quot;Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/117.0&quot; version=&quot;21.8.2&quot; etag=&quot;lZLU4BOclDeRGBZxtJZK&quot; type=&quot;device&quot;&gt;&lt;diagram id=&quot;17baaa38-e2c7-842e-4007-38427922787c&quot; name=&quot;Page-1&quot;&gt;7Vvfc+I2EP5rmGkfytiWLcNjQi65h3SaHtPp3aOwBWhiLGqLEPrXV7Ylg35AHLA5J9M8ELyyZenb/XZXKzEAk9XrQ4bWy99pjJOB58SvA3A38DzXD33+r5DshMT1xpVkkZFYyPaCKfkXC6EjpBsS41y5kVGaMLJWhRFNUxwxRYayjG7V2+Y0Ud+6RgtsCKYRSkzp3yRmy0o68uBe/hWTxVK+2YVifjMUPS8yuknF+wYemJd/VfMKyb7ERPMliun2QAS+DMAko5RV31avE5wU4ErYqufuj7TW485wypo8AKoHXlCyEVP/69ujGBrbSTjKCeHiEWcAbrdLwvB0jaKidcstgMuWbJXwK5d/ndOUTcWz5TVJkglNaFb2VcDhRRGX5yyjz/igJYYzGEDeYk5CzOsFZwy/HojEpB4wXWGW7fgtohVAAbCwwFBcbvfadKUOlgeaBEKGhAEt6p73IPIvAkc7pr6B6RPKUJKUJLnHLFqeANhtALAKaIzwaG4FFEYjPJtbVFLgSLi13yRkkXIZo+uWYPdV2Gs7P8QdWnCHLeAe2nDP8ecF21PBBo4JtudYwK7d7CVojwy0DaBxGt8UzphfRQnKcxKp2PKpZ7vvhVcZBvLyh3Ay+JWwgyZ+9UM8dRQ5hrIFZoot4Fjx8yaUB1AFFqSkLMMJYuRFjQ429MQbnijhg6s15Wua4pFE7SKnmyzC4qlDd6135Kod+aHWUYWB0VGpzHrajfQr33Sg4MkmZ3TFZSWvslaJhd04wKGNWGMYAgQNYl3OHxe+zR9rkGiFP673LgKlNOXC2xjlyxphK5UUItXk+S7vE0SqWnJuLsykaCm+J4ns/CjWldmq9tJfFta+UJJHV2NTFtYDkoajR68WWRgYVvJI0ucikSAJ+/gc1PM0AK6YqLnwWMbgVCD/zMShA2zD4IrY2rKxRQHtHWLos0Hr2/LczqDtX+qlBILADASSaT2JBCNVe9A5MxC4QO0oCLoLBGND6Q84nbKSUzDh+NzOeDSAi+Lbnxu8Mdc9+RKti6/RLiGcbNnbRJtVrHyc9Yh5YxVy18I82wJz1ALx5KsuIt5Rcp2iZCPiAQvvxn3mXdAW7/zueFdbaE+d7dhUut9nnUOvLV8bdqfz9y3NTvLcXHTV1uAOjKVaM53Dj0b0qyudqwbtDm5bFzfkx8cL9RXd2Dk5Lv1+WUrY21w1grMt0KzLV2mzM2U06ya0ny7bNwj1R423eXQPNFx9aC5ZOovuZt2+E0+veoTQgeL6CWeED7pQXvOYH5quQNpOT1yBryVsvh+04wuMrL0lXxCM9AGP2uW2WdK5I7lZbfisnLaVWTvjtFnh6SKSW4quSqG2EZV9k8qg36mcrqGzmaxXRVpM5cwy1DeB2ZRliOHFzjCIj1ZCDVU0a20f
1qKs24AtEAyYiYqBZ3EeYn3+xOtTHmgme3ROAqLv69j2/l0LHm4b2zrAtnao6jPF/BVk4D8bKht+y8tDMDf8BtdZv+4bZU3nES+4p5J98WFU3VWNJ2y4wYmNpjvUtbLfts3rYG3ZoszQNim2RZz7TRoxQlOUEHaK401OtFxU9XIuAlK0hsNRELq+V39qlB+CkQ/dQH4Cw+LH/tBTujB1AsBQ6SRoQUNm9aTaRCZ5yeYu1fROV3wFNcFwCMfcasPqM+yLksyFzx/zOU86LZXle1Jq7StK46Tl3cefUEwOtESkThfeCp36GuYs2M11gIHnW7mpkmV2WGYEluRUrmN6kpzqUR/6Z2annnbGJhh3lp0Cc3kyoWm+WZUR7Ga9TnhYLryjScR2kopftoSrxnNWm+JsbUntLdrxxbFD54MizdJfG5XDI+mCt+5jbYwY+rWT1ET3EQEexb7NR4y8GYCdpNfakcbAt6STXR1pBOby5UIf0WD76dyytOUAEOjXvq+xMtAXnuf6CGMp3FItytMODsr6d1u1KNDCUYJrxSBLqbPn5gX13YqzzUtfqrcYgsxzBS1ZgGV760ILsO1w/28C7Va72/YwMloe2Nc04mH+8x1dtO1Vdfcbk54fS7A4a2kJPWGqr639wLlMNTrS1+RnO2t+uf8lWHX7/vd24Mt/&lt;/diagram&gt;&lt;/mxfile&gt;"><defs/><g><rect x="420" y="20" width="240" height="60" fill="#fff2cc" stroke="#d6b656" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 25px; margin-left: 211px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">URL</div></div></div></foreignObject><text x="270" y="29" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">URL</text></switch></g><rect x="380" y="320" width="320" height="120" rx="18" ry="18" fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: 
left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe flex-start; justify-content: unsafe center; width: 158px; height: 1px; padding-top: 167px; margin-left: 191px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">Parallel Fetch</div></div></div></foreignObject><text x="270" y="179" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Parallel Fetch</text></switch></g><rect x="340" y="480" width="400" height="420" rx="60" ry="60" fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe flex-start; justify-content: unsafe center; width: 198px; height: 1px; padding-top: 247px; margin-left: 171px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">Parse</div></div></div></foreignObject><text x="270" y="259" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Parse</text></switch></g><path d="M 540 444 L 540 467.26" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" 
stroke-miterlimit="10" pointer-events="stroke"/><path d="M 540 477.76 L 533 463.76 L 540 467.26 L 547 463.76 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><rect x="20" y="480" width="240" height="420" rx="36" ry="36" fill="#e1d5e7" stroke="#9673a6" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 345px; margin-left: 11px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">Custom Parser</div></div></div></foreignObject><text x="70" y="349" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Custom Parser</text></switch></g><path d="M 272.74 690 L 340 690" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" stroke-dasharray="6 6" pointer-events="stroke"/><path d="M 262.24 690 L 276.24 683 L 272.74 690 L 276.24 697 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><rect x="420" y="540" width="240" height="60" rx="9" ry="9" fill="#e1d5e7" stroke="#9673a6" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div 
xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 285px; margin-left: 211px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">Link Filter</div></div></div></foreignObject><text x="270" y="289" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Link Filter</text></switch></g><rect x="420" y="630" width="240" height="60" rx="9" ry="9" fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 330px; margin-left: 211px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">Parse Link</div></div></div></foreignObject><text x="270" y="334" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Parse Link</text></switch></g><rect x="420" y="720" width="240" height="60" rx="9" ry="9" fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; 
text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 375px; margin-left: 211px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">Page Data</div></div></div></foreignObject><text x="270" y="379" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Page Data</text></switch></g><path d="M 540 600 L 540 617.26" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 540 627.76 L 533 613.76 L 540 617.26 L 547 613.76 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 480 152 C 480 109.33 600 109.33 600 152 L 600 248 C 600 290.67 480 290.67 480 248 Z" fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 480 152 C 480 184 600 184 600 152" fill="none" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 58px; height: 1px; padding-top: 112px; margin-left: 241px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); 
"><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">GenStage<br />Queue</div></div></div></foreignObject><text x="270" y="116" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">GenStage...</text></switch></g><path d="M 540 80 L 540 107.26" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 540 117.76 L 533 103.76 L 540 107.26 L 547 103.76 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 540 280 L 540 307.26" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 540 317.76 L 533 303.76 L 540 307.26 L 547 303.76 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 660 660 L 980 660 Q 1000 660 1000 640 L 1000 220 Q 1000 200 980 200 L 612.74 200" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 602.24 200 L 616.24 193 L 612.74 200 L 616.24 207 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 800 842 C 800 799.33 920 799.33 920 842 L 920 938 C 920 980.67 800 980.67 800 938 Z" fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 800 842 C 800 874 920 874 920 842" fill="none" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; 
align-items: unsafe center; justify-content: unsafe center; width: 58px; height: 1px; padding-top: 457px; margin-left: 401px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">Data Store</div></div></div></foreignObject><text x="430" y="461" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Data Store</text></switch></g><path d="M 740 776.52 L 840 776.09 Q 860 776 860 786.63 L 860 797.26" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 860 807.76 L 853 793.76 L 860 797.26 L 867 793.76 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 800 512 C 800 469.33 920 469.33 920 512 L 920 608 C 920 650.67 800 650.67 800 608 Z" fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 800 512 C 800 544 920 544 920 512" fill="none" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 58px; height: 1px; padding-top: 292px; margin-left: 401px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; 
white-space: normal; overflow-wrap: normal;">Disk</div></div></div></foreignObject><text x="430" y="296" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Disk</text></switch></g><path d="M 700 380 L 747.27 379.21" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" stroke-dasharray="6 6" pointer-events="stroke"/><path d="M 757.76 379.04 L 743.88 386.27 L 747.27 379.21 L 743.65 372.27 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><rect x="440" y="380" width="200" height="40" rx="6" ry="6" fill="#e1d5e7" stroke="#9673a6" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 98px; height: 1px; padding-top: 200px; margin-left: 221px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">Retry Strategy</div></div></div></foreignObject><text x="270" y="204" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Retry Strategy</text></switch></g><rect x="20" y="20" width="220" height="220" fill="rgb(255, 255, 255)" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" 
requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe flex-start; justify-content: unsafe center; width: 108px; height: 1px; padding-top: 17px; margin-left: 11px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;"><font style="font-size: 10px">Legend</font></div></div></div></foreignObject><text x="65" y="29" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle">Legend</text></switch></g><rect x="35.71" y="70.77" width="188.57" height="67.69" fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 92px; height: 1px; padding-top: 52px; margin-left: 19px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">Crawler Functionality</div></div></div></foreignObject><text x="65" y="56" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle">Crawler Function...</text></switch></g><rect x="35.71" y="155.38" width="188.57" height="67.69" fill="#e1d5e7" stroke="#9673a6" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject 
style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 92px; height: 1px; padding-top: 95px; margin-left: 19px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">Customisable Functionality</div></div></div></foreignObject><text x="65" y="98" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle">Customisable Fun...</text></switch></g><rect x="760" y="334" width="200" height="90" rx="13.5" ry="13.5" fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 98px; height: 1px; padding-top: 190px; margin-left: 381px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">Offline<br />File Handler</div></div></div></foreignObject><text x="430" y="193" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Offline...</text></switch></g><path d="M 860 424 L 860 467.26" fill="none" stroke="rgb(0, 0, 
0)" stroke-width="2" stroke-miterlimit="10" stroke-dasharray="6 6" pointer-events="stroke"/><path d="M 860 477.76 L 853 463.76 L 860 467.26 L 867 463.76 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><rect x="380" y="960" width="320" height="120" fill="#d5e8d4" stroke="#82b366" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 158px; height: 1px; padding-top: 510px; margin-left: 191px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">Consumer Application<br /><font style="font-size: 10px">(with multiple ways of <br />consuming Crawler data)</font></div></div></div></foreignObject><text x="270" y="514" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Consumer Application...</text></switch></g><path d="M 140 900 L 140 1000 Q 140 1020 160 1020 L 367.26 1020" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" stroke-dasharray="6 6" pointer-events="stroke"/><path d="M 377.76 1020 L 363.76 1027 L 367.26 1020 L 363.76 1013 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 540 900 L 540 947.26" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" stroke-dasharray="6 6" pointer-events="stroke"/><path d="M 540 957.76 L 533 943.76 L 540 947.26 L 547 
943.76 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 860 970 L 860 1000 Q 860 1020 840 1020 L 712.74 1020" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" stroke-dasharray="6 6" pointer-events="stroke"/><path d="M 702.24 1020 L 716.24 1013 L 712.74 1020 L 716.24 1027 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><rect x="420" y="810" width="240" height="60" rx="9" ry="9" fill="#e1d5e7" stroke="#9673a6" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 420px; margin-left: 211px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">Scraper</div></div></div></foreignObject><text x="270" y="424" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Scraper</text></switch></g><path d="M 540 780 L 540 797.26" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 540 807.76 L 533 793.76 L 540 797.26 L 547 793.76 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/></g><switch><g requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"/><a transform="translate(0,-5)" 
xlink:href="https://www.drawio.com/doc/faq/svg-export-text-problems" target="_blank"><text text-anchor="middle" font-size="10px" x="50%" y="100%">Text is not SVG - cannot display</text></a></switch></svg>
@@ -0,0 +1,9 @@
1
import Config

# Base logger setup shared by every environment: log to the console only and
# strip log calls below :info at compile time.
config :logger,
  backends: [:console],
  compile_time_purge_matching: [[level_lower_than: :info]]

# Layer the environment-specific file (dev.exs, test.exs, ...) on top when it exists.
#
# `config_env/0` is the Config-native replacement for `Mix.env/0`, and the existence
# check is anchored to this file's directory - the same base `import_config/1` uses -
# so it no longer depends on the current working directory.
env_config = "#{config_env()}.exs"

if File.exists?(Path.expand(env_config, __DIR__)) do
  import_config env_config
end
@@ -0,0 +1,5 @@
1
import Config

# Console-only logging. Purging only what is lower than :debug keeps every log
# level compiled in, overriding the stricter :info threshold of the base config.
config :logger, :backends, [:console]
config :logger, :compile_time_purge_matching, [[level_lower_than: :debug]]
@@ -0,0 +1,5 @@
1
import Config

# Console-only logging. Purging only what is lower than :debug keeps every log
# level compiled in, overriding the stricter :info threshold of the base config.
config :logger, :backends, [:console]
config :logger, :compile_time_purge_matching, [[level_lower_than: :debug]]
@@ -0,0 +1,37 @@
1
defmodule Crawler.Example.GoogleSearch.Scraper do
  @moduledoc """
  We only scrape GitHub pages, specifically looking for a project's name and description.

  Each project found is written to the `Crawler.Example.GoogleSearch.Data` agent as
  `name => %{url: url, desc: desc}`, so that agent has to be running before pages are
  scraped.
  """

  @behaviour Crawler.Scraper.Spec

  alias Crawler.Example.GoogleSearch.Data
  alias Crawler.Store.Page

  @doc """
  Extracts the repository name and description from a GitHub page.

  Pages whose URL starts with `https://github.com` are parsed with Floki; when a
  non-blank repository name is found, it is recorded in the `Data` agent together with
  the page URL and the (trimmed) description. Any other page is left untouched.

  Always returns `{:ok, page}`.
  """
  @impl true
  def scrape(%Page{url: "https://github.com" <> _ = url, body: body} = page) do
    doc = Floki.parse_document!(body)

    # Trimmed like the description, so a whitespace-only match is not stored as a name.
    name =
      doc
      |> Floki.find("#repository-container-header strong a")
      |> Floki.text()
      |> String.trim()

    desc =
      doc
      |> Floki.find(".Layout-sidebar p.f4")
      |> Floki.text()
      |> String.trim()

    if name != "" do
      Agent.update(Data, &Map.put(&1, name, %{url: url, desc: desc}))
    end

    {:ok, page}
  end

  def scrape(page), do: {:ok, page}
end
@@ -0,0 +1,11 @@
1
defmodule Crawler.Example.GoogleSearch.UrlFilter do
  @moduledoc """
  We start with Google, then only crawl GitHub.
  """

  @behaviour Crawler.Fetcher.UrlFilter.Spec

  @allowed_hosts ["www.google.com", "github.com"]

  @doc """
  Lets a URL through only when it is an `https` URL on one of the allowed hosts.

  The host is compared exactly rather than by string prefix, so look-alike hosts such
  as `github.community` or `github.com.example.org` are rejected.

  Returns `{:ok, true}` to crawl the URL and `{:ok, false}` to skip it.
  """
  @impl true
  def filter(url, _opts) when is_binary(url) do
    case URI.parse(url) do
      %URI{scheme: "https", host: host} when host in @allowed_hosts -> {:ok, true}
      _ -> {:ok, false}
    end
  end

  def filter(_url, _opts), do: {:ok, false}
end
@@ -0,0 +1,77 @@
1
defmodule Crawler.Example.GoogleSearch do
  @moduledoc """
  This example performs a Google search, then scrapes the results to find Github
  projects and output their name and description.

  Example output:

      Agent.get(Data, & &1) #=> %{
        "crawler" => %{
          desc: "A high performance web crawler / scraper in Elixir.",
          url: "https://github.com/fredwu/crawler"
        },
        "crawly" => %{
          desc: "Crawly, a high-level web crawling & scraping framework for Elixir.",
          url: "https://github.com/elixir-crawly/crawly"
        },
        "elixir_scraper" => %{
          desc: "Elixir/Hound web scraper example",
          url: "https://github.com/jaydorsey/elixir_scraper"
        },
        "mechanize" => %{
          desc: "Build web scrapers and automate interaction with websites in Elixir with ease!",
          url: "https://github.com/gushonorato/mechanize"
        }
      }
  """

  alias Crawler.Example.GoogleSearch.Data
  alias Crawler.Example.GoogleSearch.Scraper
  alias Crawler.Example.GoogleSearch.UrlFilter

  @site_url "https://www.google.com/search?"
  @search_term "github web scrapers in Elixir"

  # Pause between two checks of `Crawler.running?/1`, in milliseconds.
  @poll_interval 500

  # Pause after the crawler reports it has stopped, in milliseconds.
  @scrape_grace_period 2_000

  @doc """
  Runs the example: starts the `Data` agent, crawls the search results, blocks until
  the crawler is no longer running, then prints (via `dbg/1`) and returns the
  collected data.

  There is no upper bound on the wait; it lasts for as long as
  `Crawler.running?/1` does not return `false`.
  """
  def run do
    Agent.start_link(fn -> %{} end, name: Data)

    # Do not crawl Google too fast, or you will get blocked

    {:ok, opts} =
      Crawler.crawl(
        search_url(),
        workers: 2,
        max_depths: 2,
        max_pages: 10,
        interval: 80,
        user_agent: "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/118.0",
        scraper: Scraper,
        url_filter: UrlFilter
      )

    await_crawl(opts)

    # give the scraper time to finish
    Process.sleep(@scrape_grace_period)

    dbg(Agent.get(Data, & &1))
  end

  defp search_url do
    @site_url <> URI.encode_query(%{"q" => @search_term})
  end

  # Polls until the crawler reports `false` for `running?/1`.
  #
  # This replaces a rescue-driven retry loop whose timeout budget was decremented but
  # never checked: the wait was already unbounded, only now it no longer uses
  # exceptions for control flow, and real errors propagate instead of being retried.
  defp await_crawl(opts) do
    case Crawler.running?(opts) do
      false ->
        :ok

      _still_running ->
        Process.sleep(@poll_interval)
        await_crawl(opts)
    end
  end
end
@@ -0,0 +1,14 @@
1
defmodule Crawler.Dispatcher.Worker do
  @moduledoc """
  Worker process that carries out the crawling.
  """

  @doc """
  Starts a linked task that calls `Crawler.crawl_now/1` with the given `opts`.
  """
  def start_link(opts), do: Task.start_link(Crawler, :crawl_now, [opts])
end
@@ -0,0 +1,20 @@
1
defmodule Crawler.Dispatcher do
  @moduledoc """
  Dispatches requests to a queue for crawling.
  """

  @doc """
  Pulls the URL out of `request` and hands it to `Crawler.crawl/2` together with `opts`.

  `request` is a tuple in one of two shapes:

  - `{_, link, _, url}` - a link that has been transformed into a URL
  - `{_, url}` - a plain URL

  In both shapes the URL is the last element.
  """
  def dispatch(request, opts) do
    url =
      case request do
        {_, _link, _, transformed_url} -> transformed_url
        {_, plain_url} -> plain_url
      end

    Crawler.crawl(url, opts)
  end
end
@@ -0,0 +1,60 @@
1
defmodule Crawler.Fetcher.HeaderPreparer do
  @moduledoc """
  Captures and prepares HTTP response headers.
  """

  @default_content_type "text/html"

  @doc """
  Captures and prepares HTTP response headers.

  Puts the raw `headers` into `opts` under `:headers`, and a simplified content type
  under `:content_type`. The content type comes from the first `Content-Type` header
  (name matched case-insensitively) with everything from the first `;` onwards removed
  and surrounding whitespace trimmed. It falls back to `"text/html"` when `headers` is
  `nil` or contains no such header.

  ## Examples

      iex> HeaderPreparer.prepare(
      iex>   [{"Content-Type", "text/html"}],
      iex>   %{}
      iex> )
      %{headers: [{"Content-Type", "text/html"}], content_type: "text/html"}

      iex> HeaderPreparer.prepare(
      iex>   [{"Content-Type", "text/css"}],
      iex>   %{}
      iex> )
      %{headers: [{"Content-Type", "text/css"}], content_type: "text/css"}

      iex> HeaderPreparer.prepare(
      iex>   [{"Content-Type", "image/png; blah"}],
      iex>   %{}
      iex> )
      %{headers: [{"Content-Type", "image/png; blah"}], content_type: "image/png"}

      iex> HeaderPreparer.prepare(
      iex>   [{"content-type", "text/html ; charset=utf-8"}],
      iex>   %{}
      iex> )
      %{headers: [{"content-type", "text/html ; charset=utf-8"}], content_type: "text/html"}
  """
  @spec prepare([{String.t(), String.t()}] | nil, map) :: map
  def prepare(headers, opts) do
    content_type =
      headers
      |> get_content_type()
      |> simplify_content_type()

    Map.merge(opts, %{headers: headers, content_type: content_type})
  end

  defp get_content_type(nil), do: @default_content_type

  defp get_content_type(headers) do
    case Enum.find(headers, &content_type_header?/1) do
      {_, value} -> value
      _ -> @default_content_type
    end
  end

  defp content_type_header?({header, _}) do
    String.downcase(header) == "content-type"
  end

  # Keeps only the media type: drops parameters after the first ";" and trims the
  # whitespace that may sit around it (e.g. "text/html ; charset=utf-8").
  defp simplify_content_type(content_type) do
    content_type
    |> String.split(";", parts: 2)
    |> hd()
    |> String.trim()
  end
end
@@ -0,0 +1,45 @@
1
defmodule Crawler.Fetcher.Modifier do
  @moduledoc """
  Modifies request options and headers before dispatch.
  """

  defmodule Spec do
    @moduledoc """
    Spec for defining a request modifier.
    """

    @type url :: String.t()
    @type header :: {String.t(), String.t()}
    @type opts :: map

    @callback headers(opts) :: [header]
    @callback opts(opts) :: keyword
  end

  @behaviour __MODULE__.Spec

  @doc """
  Allows modifying headers prior to making a crawl request.

  This default implementation adds no headers.

  ## Example implementation

      def headers(opts) do
        if opts[:url] == "http://modifier" do
          [{"Referer", "http://fetcher"}]
        else
          []
        end
      end
  """
  @impl true
  def headers(_opts), do: []

  @doc """
  Allows passing opts to HTTPoison prior to making the crawl request.

  This default implementation adds no options.

  ## Example implementation

      def opts(opts) do
        if opts[:url] == "http://modifier" do
          # add a new pool to hackney
          [hackney: [pool: :modifier]]
        else
          []
        end
      end
  """
  @impl true
  def opts(_opts), do: []
end
@@ -0,0 +1,77 @@
1
defmodule Crawler.Fetcher.Policer do
  @moduledoc """
  Checks a series of conditions to determine whether it is okay to continue.
  """

  require Logger

  alias Crawler.Store

  @uri_schemes ["http", "https"]
  @asset_extra_depth 2

  @doc """
  Checks a series of conditions to determine whether it is okay to continue,
  i.e. to allow `Crawler.Fetcher.fetch/1` to begin its tasks.

  The checks run in this order and stop at the first one that fails:

  1. the page count is still below `:max_pages`
  2. `:depth` is below `:max_depths` (non-`"a"` tags get #{@asset_extra_depth} extra levels)
  3. the URL scheme is `http` or `https`
  4. the `{url, scope}` pair has not been stored yet
  5. the `:url_filter` module lets the URL through

  A check passes automatically when `opts` does not carry the keys it needs.

  Returns `{:ok, opts}` when everything passes, otherwise `{:warn, message}` where the
  message names the failed check.
  """
  def police(opts) do
    with {_, true} <- within_max_pages?(opts),
         {_, true} <- within_fetch_depth?(opts),
         {_, true} <- acceptable_uri_scheme?(opts),
         {_, true} <- not_fetched_yet?(opts),
         {_, true} <- perform_url_filtering(opts) do
      {:ok, opts}
    else
      {fail_type, _} -> police_warn(fail_type, opts)
    end
  end

  defp within_max_pages?(%{max_pages: :infinity} = _opts), do: {:within_max_pages?, true}

  defp within_max_pages?(%{max_pages: max_pages} = _opts) when is_integer(max_pages) do
    {:within_max_pages?, Store.ops_count() < max_pages}
  end

  defp within_max_pages?(_opts), do: {:within_max_pages?, true}

  defp within_fetch_depth?(%{depth: depth, max_depths: max_depths} = opts) do
    # Anything that is not an `<a>` link is allowed to go a little deeper.
    max_depths =
      case opts[:html_tag] do
        "a" -> max_depths
        _ -> max_depths + @asset_extra_depth
      end

    {:within_fetch_depth?, depth < max_depths}
  end

  defp within_fetch_depth?(_opts), do: {:within_fetch_depth?, true}

  defp acceptable_uri_scheme?(%{url: url} = _opts) do
    scheme =
      url
      |> String.split("://", parts: 2)
      |> Kernel.hd()

    {:acceptable_uri_scheme?, Enum.member?(@uri_schemes, scheme)}
  end

  defp acceptable_uri_scheme?(_opts), do: {:acceptable_uri_scheme?, true}

  defp not_fetched_yet?(%{url: url, scope: scope} = _opts) do
    {:not_fetched_yet?, !Store.find({url, scope})}
  end

  defp not_fetched_yet?(_opts), do: {:not_fetched_yet?, true}

  defp perform_url_filtering(%{url_filter: url_filter, url: url} = opts) do
    case url_filter.filter(url, opts) do
      {:ok, pass_through?} ->
        {:perform_url_filtering, pass_through?}

      # The filter spec allows `{:error, term}` as well; count it as a failed check
      # instead of crashing on a match error.
      {:error, _reason} ->
        {:perform_url_filtering, false}
    end
  end

  defp perform_url_filtering(_opts), do: {:perform_url_filtering, true}

  defp police_warn(fail_type, opts) do
    {:warn, "Fetch failed check '#{fail_type}', with opts: #{Kernel.inspect(opts)}."}
  end
end
@@ -0,0 +1,55 @@
1
defmodule Crawler.Fetcher.Recorder do
  @moduledoc """
  Records information about each crawl for internal use.
  """

  alias Crawler.Store

  @doc """
  Registers the crawl's `{url, scope}` pair in `Crawler.Store` and bumps its depth.

  `opts` may be any enumerable of key-value pairs; it is turned into a map first. When
  the store accepts the entry, the result is `{:ok, opts}` with `:depth` incremented by
  one; any other reply from the store is returned as-is.

  ## Examples

      iex> Recorder.record(url: "url1", depth: 2)
      {:ok, %{depth: 3, url: "url1"}}

      iex> Recorder.record(url: "url2", depth: 2)
      iex> Store.find({"url2", nil})
      %Page{url: "url2"}
  """
  def record(opts) do
    crawl_opts = Map.new(opts)

    case Store.add({crawl_opts[:url], crawl_opts[:scope]}) do
      {:ok, _pid} -> {:ok, %{crawl_opts | depth: crawl_opts[:depth] + 1}}
      other -> other
    end
  end

  @doc """
  Stores page data in `Crawler.Store.DB` for internal or external consumption, if enabled.

  Storing is skipped when `:store` is `nil`; otherwise the configured store module's
  `add_page_data/3` is called and its result wrapped in an `:ok` tuple.

  ## Examples

      iex> Recorder.maybe_store_page("body", %{store: nil})
      {:ok, nil}

      iex> Recorder.record(url: "url", depth: 2)
      iex> Recorder.maybe_store_page("body", %{store: Store, url: "url", scope: nil})
      {:ok, {%Page{url: "url", body: "body", opts: %{store: Store, url: "url", scope: nil}}, %Page{url: "url", body: nil}}}
  """
  def maybe_store_page(_body, %{store: nil} = _opts), do: {:ok, nil}

  def maybe_store_page(body, opts) do
    store = opts[:store]
    page_key = {opts[:url], opts[:scope]}

    {:ok, store.add_page_data(page_key, body, opts)}
  end
end
@@ -0,0 +1,32 @@
1
defmodule Crawler.Fetcher.Requester do
  @moduledoc """
  Makes HTTP requests.
  """

  alias Crawler.HTTP

  @fetch_opts [
    follow_redirect: true,
    max_redirect: 5
  ]

  @doc """
  Issues a GET request for `opts[:url]` through `Crawler.HTTP`.

  The request headers are a `User-Agent` header built from `opts[:user_agent]` followed
  by whatever `opts[:modifier].headers/1` returns. The request options are the redirect
  defaults, then `recv_timeout: opts[:timeout]`, then whatever `opts[:modifier].opts/1`
  returns.

  ## Examples

      iex> Requester.make(url: "fake.url", modifier: Crawler.Fetcher.Modifier)
      {:error, %HTTPoison.Error{id: nil, reason: :nxdomain}}
  """
  def make(opts) do
    modifier = opts[:modifier]

    headers = [{"User-Agent", opts[:user_agent]} | modifier.headers(opts)]
    http_opts = @fetch_opts ++ [{:recv_timeout, opts[:timeout]} | modifier.opts(opts)]

    HTTP.get(opts[:url], headers, http_opts)
  end
end
@@ -0,0 +1,43 @@
1
defmodule Crawler.Fetcher.Retrier do
  @moduledoc """
  Handles retries for failed crawls.
  """

  defmodule Spec do
    @moduledoc """
    Spec for defining a fetch retrier.
    """

    @type fetch_url :: fun
    @type opts :: map

    @callback perform(fetch_url, opts) :: term
  end

  use Retry

  @behaviour __MODULE__.Spec

  @default_timeout 5_000

  @doc """
  Calls `fetch_url` through the `retry` macro, using exponential backoff delays that
  expire after `opts[:timeout]` (#{@default_timeout} when it is not an integer) and are
  capped at `opts[:retries]` entries.

  Whatever the `retry` block yields - result or error - is returned unchanged.

  More information: [https://github.com/safwank/ElixirRetry](https://github.com/safwank/ElixirRetry)
  """
  def perform(fetch_url, opts) do
    retry with: retry_delays(opts) do
      fetch_url.()
    after
      result -> result
    else
      error -> error
    end
  end

  # Delay stream handed to `retry`.
  defp retry_delays(opts) do
    exponential_backoff()
    |> expiry(timeout_value(opts[:timeout]))
    |> Stream.take(opts[:retries])
  end

  defp timeout_value(value) when is_integer(value), do: value
  defp timeout_value(_value), do: @default_timeout
end
@@ -0,0 +1,26 @@
1
defmodule Crawler.Fetcher.UrlFilter do
  @moduledoc """
  Default URL filter: a placeholder that accepts every URL.
  """

  defmodule Spec do
    @moduledoc """
    Behaviour to implement when writing a custom URL filter.
    """

    @type url :: String.t()
    @type opts :: map

    @callback filter(url, opts) :: {:ok, boolean} | {:error, term}
  end

  @behaviour __MODULE__.Spec

  @doc """
  Decides whether a given URL may pass through.

  - `true` lets the url through
  - `false` rejects the url

  This placeholder always answers `{:ok, true}`.
  """
  def filter(_url, _opts) do
    {:ok, true}
  end
end