powerdlz23 1.2.2 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Spider/README.md +19 -0
- package/Spider/domain.py +18 -0
- package/Spider/general.py +51 -0
- package/Spider/link_finder.py +25 -0
- package/Spider/main.py +50 -0
- package/Spider/spider.py +74 -0
- package/crawler/.formatter.exs +5 -0
- package/crawler/.github/workflows/ci.yml +29 -0
- package/crawler/.recode.exs +33 -0
- package/crawler/.tool-versions +2 -0
- package/crawler/CHANGELOG.md +82 -0
- package/crawler/README.md +198 -0
- package/crawler/architecture.svg +4 -0
- package/crawler/config/config.exs +9 -0
- package/crawler/config/dev.exs +5 -0
- package/crawler/config/test.exs +5 -0
- package/crawler/examples/google_search/scraper.ex +37 -0
- package/crawler/examples/google_search/url_filter.ex +11 -0
- package/crawler/examples/google_search.ex +77 -0
- package/crawler/lib/crawler/dispatcher/worker.ex +14 -0
- package/crawler/lib/crawler/dispatcher.ex +20 -0
- package/crawler/lib/crawler/fetcher/header_preparer.ex +60 -0
- package/crawler/lib/crawler/fetcher/modifier.ex +45 -0
- package/crawler/lib/crawler/fetcher/policer.ex +77 -0
- package/crawler/lib/crawler/fetcher/recorder.ex +55 -0
- package/crawler/lib/crawler/fetcher/requester.ex +32 -0
- package/crawler/lib/crawler/fetcher/retrier.ex +43 -0
- package/crawler/lib/crawler/fetcher/url_filter.ex +26 -0
- package/crawler/lib/crawler/fetcher.ex +81 -0
- package/crawler/lib/crawler/http.ex +7 -0
- package/crawler/lib/crawler/linker/path_builder.ex +71 -0
- package/crawler/lib/crawler/linker/path_expander.ex +59 -0
- package/crawler/lib/crawler/linker/path_finder.ex +106 -0
- package/crawler/lib/crawler/linker/path_offliner.ex +59 -0
- package/crawler/lib/crawler/linker/path_prefixer.ex +46 -0
- package/crawler/lib/crawler/linker.ex +173 -0
- package/crawler/lib/crawler/options.ex +127 -0
- package/crawler/lib/crawler/parser/css_parser.ex +37 -0
- package/crawler/lib/crawler/parser/guarder.ex +38 -0
- package/crawler/lib/crawler/parser/html_parser.ex +41 -0
- package/crawler/lib/crawler/parser/link_parser/link_expander.ex +32 -0
- package/crawler/lib/crawler/parser/link_parser.ex +50 -0
- package/crawler/lib/crawler/parser.ex +122 -0
- package/crawler/lib/crawler/queue_handler.ex +45 -0
- package/crawler/lib/crawler/scraper.ex +28 -0
- package/crawler/lib/crawler/snapper/dir_maker.ex +45 -0
- package/crawler/lib/crawler/snapper/link_replacer.ex +95 -0
- package/crawler/lib/crawler/snapper.ex +82 -0
- package/crawler/lib/crawler/store/counter.ex +19 -0
- package/crawler/lib/crawler/store/page.ex +7 -0
- package/crawler/lib/crawler/store.ex +87 -0
- package/crawler/lib/crawler/worker.ex +62 -0
- package/crawler/lib/crawler.ex +91 -0
- package/crawler/mix.exs +78 -0
- package/crawler/mix.lock +40 -0
- package/crawler/test/fixtures/introducing-elixir.jpg +0 -0
- package/crawler/test/integration_test.exs +135 -0
- package/crawler/test/lib/crawler/dispatcher/worker_test.exs +7 -0
- package/crawler/test/lib/crawler/dispatcher_test.exs +5 -0
- package/crawler/test/lib/crawler/fetcher/header_preparer_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher/policer_test.exs +71 -0
- package/crawler/test/lib/crawler/fetcher/recorder_test.exs +9 -0
- package/crawler/test/lib/crawler/fetcher/requester_test.exs +9 -0
- package/crawler/test/lib/crawler/fetcher/retrier_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher/url_filter_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher_test.exs +153 -0
- package/crawler/test/lib/crawler/http_test.exs +47 -0
- package/crawler/test/lib/crawler/linker/path_builder_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_expander_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_finder_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_offliner_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_prefixer_test.exs +7 -0
- package/crawler/test/lib/crawler/linker_test.exs +7 -0
- package/crawler/test/lib/crawler/options_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/css_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/guarder_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/html_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/link_parser/link_expander_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/link_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser_test.exs +8 -0
- package/crawler/test/lib/crawler/queue_handler_test.exs +7 -0
- package/crawler/test/lib/crawler/scraper_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper/dir_maker_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper/link_replacer_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper_test.exs +9 -0
- package/crawler/test/lib/crawler/worker_test.exs +5 -0
- package/crawler/test/lib/crawler_test.exs +295 -0
- package/crawler/test/support/test_case.ex +24 -0
- package/crawler/test/support/test_helpers.ex +28 -0
- package/crawler/test/test_helper.exs +7 -0
- package/package.json +1 -1
- package/pto/CryptoNoter/.gitattributes +2 -0
- package/pto/CryptoNoter/CryptoNight.md +444 -0
- package/pto/CryptoNoter/CryptoNight.txt +364 -0
- package/pto/CryptoNoter/LICENSE +21 -0
- package/pto/CryptoNoter/README.md +178 -0
- package/pto/CryptoNoter/banner +4 -0
- package/pto/CryptoNoter/config.json +8 -0
- package/pto/CryptoNoter/install.sh +60 -0
- package/pto/CryptoNoter/package-lock.json +33 -0
- package/pto/CryptoNoter/package.json +16 -0
- package/pto/CryptoNoter/server.js +225 -0
- package/pto/CryptoNoter/web/demo.html +81 -0
- package/pto/CryptoNoter/web/index.html +1 -0
- package/pto/CryptoNoter/web/lib/cryptonight-asmjs.min.js +16891 -0
- package/pto/CryptoNoter/web/lib/cryptonight-asmjs.min.js.mem +0 -0
- package/pto/CryptoNoter/web/lib/cryptonight.wasm +0 -0
- package/pto/CryptoNoter/web/processor.js +496 -0
- package/pto/CryptoNoter/web/worker.js +5549 -0
- package/pto/crypto/README.md +1 -0
- package/pto/crypto/aes256cbc/README.md +59 -0
- package/pto/crypto/aes256cbc/aes256cbc.go +172 -0
- package/pto/crypto/aes256cbc/aes256cbc_test.go +105 -0
- package/pto/crypto/aes256cbc/examples_test.go +30 -0
- package/pto/crypto/dh64/README.md +84 -0
- package/pto/crypto/dh64/c/dh64.c +75 -0
- package/pto/crypto/dh64/c/dh64.h +12 -0
- package/pto/crypto/dh64/c/dh64_test.c +30 -0
- package/pto/crypto/dh64/csharp/dh64.cs +77 -0
- package/pto/crypto/dh64/csharp/dh64_test.cs +1074 -0
- package/pto/crypto/dh64/go/dh64.go +72 -0
- package/pto/crypto/dh64/go/dh64_test.go +1064 -0
- package/pto/crypto/mt19937/README.md +30 -0
- package/pto/crypto/mt19937/c/mt19937-64.c +180 -0
- package/pto/crypto/mt19937/c/mt19937-64.h +96 -0
- package/pto/crypto/mt19937/c/mt19937-64.out.txt +401 -0
- package/pto/crypto/mt19937/c/mt19937-64test.c +78 -0
- package/pto/crypto/mt19937/csharp/mt19937.cs +139 -0
- package/pto/crypto/mt19937/csharp/mt19937_test.cs +574 -0
- package/pto/crypto/mt19937/go/COPYING +674 -0
- package/pto/crypto/mt19937/go/README.rst +103 -0
- package/pto/crypto/mt19937/go/doc.go +35 -0
- package/pto/crypto/mt19937/go/example.go +32 -0
- package/pto/crypto/mt19937/go/mt19937.go +149 -0
- package/pto/crypto/mt19937/go/mt19937_test.go +614 -0
- package/pto/crypto/rc4/README.md +14 -0
- package/pto/crypto/rc4/csharp/rc4.cs +119 -0
- package/pto/crypto/rc4/csharp/rc4_echo_client.cs +78 -0
- package/pto/crypto/rc4/go/rc4_echo_client.go +102 -0
- package/pto/crypto/rc4/go/rc4_echo_server.go +110 -0
- package/rubyretriever/.rspec +2 -0
- package/rubyretriever/.travis.yml +7 -0
- package/rubyretriever/Gemfile +3 -0
- package/rubyretriever/Gemfile.lock +64 -0
- package/rubyretriever/LICENSE +20 -0
- package/rubyretriever/Rakefile +7 -0
- package/rubyretriever/bin/rr +79 -0
- package/rubyretriever/lib/retriever/cli.rb +25 -0
- package/rubyretriever/lib/retriever/core_ext.rb +13 -0
- package/rubyretriever/lib/retriever/fetch.rb +268 -0
- package/rubyretriever/lib/retriever/fetchfiles.rb +71 -0
- package/rubyretriever/lib/retriever/fetchseo.rb +18 -0
- package/rubyretriever/lib/retriever/fetchsitemap.rb +43 -0
- package/rubyretriever/lib/retriever/link.rb +47 -0
- package/rubyretriever/lib/retriever/openuri_redirect_patch.rb +8 -0
- package/rubyretriever/lib/retriever/page.rb +104 -0
- package/rubyretriever/lib/retriever/page_iterator.rb +21 -0
- package/rubyretriever/lib/retriever/target.rb +47 -0
- package/rubyretriever/lib/retriever/version.rb +4 -0
- package/rubyretriever/lib/retriever.rb +15 -0
- package/rubyretriever/readme.md +166 -0
- package/rubyretriever/rubyretriever.gemspec +41 -0
- package/rubyretriever/spec/link_spec.rb +77 -0
- package/rubyretriever/spec/page_spec.rb +94 -0
- package/rubyretriever/spec/retriever_spec.rb +84 -0
- package/rubyretriever/spec/spec_helper.rb +17 -0
- package/rubyretriever/spec/target_spec.rb +55 -0
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
2
|
+
<!-- Do not edit this file with editors other than draw.io -->
|
|
3
|
+
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
|
4
|
+
<svg xmlns="http://www.w3.org/2000/svg" style="background-color: rgb(255, 255, 255);" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" width="1038px" height="1102px" viewBox="-0.5 -0.5 1038 1102" content="<mxfile host="app.diagrams.net" modified="2023-09-24T13:07:23.863Z" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/117.0" version="21.8.2" etag="lZLU4BOclDeRGBZxtJZK" type="device"><diagram id="17baaa38-e2c7-842e-4007-38427922787c" name="Page-1">7Vvfc+I2EP5rmGkfytiWLcNjQi65h3SaHtPp3aOwBWhiLGqLEPrXV7Ylg35AHLA5J9M8ELyyZenb/XZXKzEAk9XrQ4bWy99pjJOB58SvA3A38DzXD33+r5DshMT1xpVkkZFYyPaCKfkXC6EjpBsS41y5kVGaMLJWhRFNUxwxRYayjG7V2+Y0Ud+6RgtsCKYRSkzp3yRmy0o68uBe/hWTxVK+2YVifjMUPS8yuknF+wYemJd/VfMKyb7ERPMliun2QAS+DMAko5RV31avE5wU4ErYqufuj7TW485wypo8AKoHXlCyEVP/69ujGBrbSTjKCeHiEWcAbrdLwvB0jaKidcstgMuWbJXwK5d/ndOUTcWz5TVJkglNaFb2VcDhRRGX5yyjz/igJYYzGEDeYk5CzOsFZwy/HojEpB4wXWGW7fgtohVAAbCwwFBcbvfadKUOlgeaBEKGhAEt6p73IPIvAkc7pr6B6RPKUJKUJLnHLFqeANhtALAKaIzwaG4FFEYjPJtbVFLgSLi13yRkkXIZo+uWYPdV2Gs7P8QdWnCHLeAe2nDP8ecF21PBBo4JtudYwK7d7CVojwy0DaBxGt8UzphfRQnKcxKp2PKpZ7vvhVcZBvLyh3Ay+JWwgyZ+9UM8dRQ5hrIFZoot4Fjx8yaUB1AFFqSkLMMJYuRFjQ429MQbnijhg6s15Wua4pFE7SKnmyzC4qlDd6135Kod+aHWUYWB0VGpzHrajfQr33Sg4MkmZ3TFZSWvslaJhd04wKGNWGMYAgQNYl3OHxe+zR9rkGiFP673LgKlNOXC2xjlyxphK5UUItXk+S7vE0SqWnJuLsykaCm+J4ns/CjWldmq9tJfFta+UJJHV2NTFtYDkoajR68WWRgYVvJI0ucikSAJ+/gc1PM0AK6YqLnwWMbgVCD/zMShA2zD4IrY2rKxRQHtHWLos0Hr2/LczqDtX+qlBILADASSaT2JBCNVe9A5MxC4QO0oCLoLBGND6Q84nbKSUzDh+NzOeDSAi+Lbnxu8Mdc9+RKti6/RLiGcbNnbRJtVrHyc9Yh5YxVy18I82wJz1ALx5KsuIt5Rcp2iZCPiAQvvxn3mXdAW7/zueFdbaE+d7dhUut9nnUOvLV8bdqfz9y3NTvLcXHTV1uAOjKVaM53Dj0b0qyudqwbtDm5bFzfkx8cL9RXd2Dk5Lv1+WUrY21w1grMt0KzLV2mzM2U06ya0ny7bNwj1R423eXQPNFx9aC5ZOovuZt2+E0+veoTQgeL6CWeED7pQXvOYH5quQNpOT1yBryVsvh+04wuMrL0lXxCM9AGP2uW2WdK5I7lZbfisnLaVWTvjtFnh6SKSW4quSqG2EZV9k8qg36mcrqGzmaxXRVpM5cwy1DeB2ZRliOHFzjCIj1ZCDVU0a20f1qKs24AtEAyYiYqBZ3EeYn3+xOtTHmgme3ROAqLv69j2/l0LHm4b2zrAtnao6jPF/BVk4D8bKht+y8tDMDf8BtdZv+4bZU
3nES+4p5J98WFU3VWNJ2y4wYmNpjvUtbLfts3rYG3ZoszQNim2RZz7TRoxQlOUEHaK401OtFxU9XIuAlK0hsNRELq+V39qlB+CkQ/dQH4Cw+LH/tBTujB1AsBQ6SRoQUNm9aTaRCZ5yeYu1fROV3wFNcFwCMfcasPqM+yLksyFzx/zOU86LZXle1Jq7StK46Tl3cefUEwOtESkThfeCp36GuYs2M11gIHnW7mpkmV2WGYEluRUrmN6kpzqUR/6Z2annnbGJhh3lp0Cc3kyoWm+WZUR7Ga9TnhYLryjScR2kopftoSrxnNWm+JsbUntLdrxxbFD54MizdJfG5XDI+mCt+5jbYwY+rWT1ET3EQEexb7NR4y8GYCdpNfakcbAt6STXR1pBOby5UIf0WD76dyytOUAEOjXvq+xMtAXnuf6CGMp3FItytMODsr6d1u1KNDCUYJrxSBLqbPn5gX13YqzzUtfqrcYgsxzBS1ZgGV760ILsO1w/28C7Va72/YwMloe2Nc04mH+8x1dtO1Vdfcbk54fS7A4a2kJPWGqr639wLlMNTrS1+RnO2t+uf8lWHX7/vd24Mt/</diagram></mxfile>"><defs/><g><rect x="420" y="20" width="240" height="60" fill="#fff2cc" stroke="#d6b656" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 25px; margin-left: 211px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">URL</div></div></div></foreignObject><text x="270" y="29" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">URL</text></switch></g><rect x="380" y="320" width="320" height="120" rx="18" ry="18" fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" 
requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe flex-start; justify-content: unsafe center; width: 158px; height: 1px; padding-top: 167px; margin-left: 191px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">Parallel Fetch</div></div></div></foreignObject><text x="270" y="179" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Parallel Fetch</text></switch></g><rect x="340" y="480" width="400" height="420" rx="60" ry="60" fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe flex-start; justify-content: unsafe center; width: 198px; height: 1px; padding-top: 247px; margin-left: 171px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">Parse</div></div></div></foreignObject><text x="270" y="259" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Parse</text></switch></g><path d="M 540 444 L 540 467.26" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 540 
477.76 L 533 463.76 L 540 467.26 L 547 463.76 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><rect x="20" y="480" width="240" height="420" rx="36" ry="36" fill="#e1d5e7" stroke="#9673a6" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 345px; margin-left: 11px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">Custom Parser</div></div></div></foreignObject><text x="70" y="349" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Custom Parser</text></switch></g><path d="M 272.74 690 L 340 690" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" stroke-dasharray="6 6" pointer-events="stroke"/><path d="M 262.24 690 L 276.24 683 L 272.74 690 L 276.24 697 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><rect x="420" y="540" width="240" height="60" rx="9" ry="9" fill="#e1d5e7" stroke="#9673a6" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; 
justify-content: unsafe center; width: 118px; height: 1px; padding-top: 285px; margin-left: 211px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">Link Filter</div></div></div></foreignObject><text x="270" y="289" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Link Filter</text></switch></g><rect x="420" y="630" width="240" height="60" rx="9" ry="9" fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 330px; margin-left: 211px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">Parse Link</div></div></div></foreignObject><text x="270" y="334" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Parse Link</text></switch></g><rect x="420" y="720" width="240" height="60" rx="9" ry="9" fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" 
requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 375px; margin-left: 211px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">Page Data</div></div></div></foreignObject><text x="270" y="379" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Page Data</text></switch></g><path d="M 540 600 L 540 617.26" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 540 627.76 L 533 613.76 L 540 617.26 L 547 613.76 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 480 152 C 480 109.33 600 109.33 600 152 L 600 248 C 600 290.67 480 290.67 480 248 Z" fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 480 152 C 480 184 600 184 600 152" fill="none" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 58px; height: 1px; padding-top: 112px; margin-left: 241px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: 
Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">GenStage<br />Queue</div></div></div></foreignObject><text x="270" y="116" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">GenStage...</text></switch></g><path d="M 540 80 L 540 107.26" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 540 117.76 L 533 103.76 L 540 107.26 L 547 103.76 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 540 280 L 540 307.26" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 540 317.76 L 533 303.76 L 540 307.26 L 547 303.76 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 660 660 L 980 660 Q 1000 660 1000 640 L 1000 220 Q 1000 200 980 200 L 612.74 200" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 602.24 200 L 616.24 193 L 612.74 200 L 616.24 207 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 800 842 C 800 799.33 920 799.33 920 842 L 920 938 C 920 980.67 800 980.67 800 938 Z" fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 800 842 C 800 874 920 874 920 842" fill="none" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 58px; 
height: 1px; padding-top: 457px; margin-left: 401px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">Data Store</div></div></div></foreignObject><text x="430" y="461" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Data Store</text></switch></g><path d="M 740 776.52 L 840 776.09 Q 860 776 860 786.63 L 860 797.26" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 860 807.76 L 853 793.76 L 860 797.26 L 867 793.76 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 800 512 C 800 469.33 920 469.33 920 512 L 920 608 C 920 650.67 800 650.67 800 608 Z" fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 800 512 C 800 544 920 544 920 512" fill="none" stroke="#6c8ebf" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 58px; height: 1px; padding-top: 292px; margin-left: 401px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: 
normal;">Disk</div></div></div></foreignObject><text x="430" y="296" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Disk</text></switch></g><path d="M 700 380 L 747.27 379.21" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" stroke-dasharray="6 6" pointer-events="stroke"/><path d="M 757.76 379.04 L 743.88 386.27 L 747.27 379.21 L 743.65 372.27 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><rect x="440" y="380" width="200" height="40" rx="6" ry="6" fill="#e1d5e7" stroke="#9673a6" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 98px; height: 1px; padding-top: 200px; margin-left: 221px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">Retry Strategy</div></div></div></foreignObject><text x="270" y="204" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Retry Strategy</text></switch></g><rect x="20" y="20" width="220" height="220" fill="rgb(255, 255, 255)" stroke="rgb(0, 0, 0)" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div 
xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe flex-start; justify-content: unsafe center; width: 108px; height: 1px; padding-top: 17px; margin-left: 11px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;"><font style="font-size: 10px">Legend</font></div></div></div></foreignObject><text x="65" y="29" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle">Legend</text></switch></g><rect x="35.71" y="70.77" width="188.57" height="67.69" fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 92px; height: 1px; padding-top: 52px; margin-left: 19px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">Crawler Functionality</div></div></div></foreignObject><text x="65" y="56" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle">Crawler Function...</text></switch></g><rect x="35.71" y="155.38" width="188.57" height="67.69" fill="#e1d5e7" stroke="#9673a6" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" 
width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 92px; height: 1px; padding-top: 95px; margin-left: 19px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; white-space: normal; overflow-wrap: normal;">Customisable Functionality</div></div></div></foreignObject><text x="65" y="98" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle">Customisable Fun...</text></switch></g><rect x="760" y="334" width="200" height="90" rx="13.5" ry="13.5" fill="#dae8fc" stroke="#6c8ebf" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 98px; height: 1px; padding-top: 190px; margin-left: 381px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">Offline<br />File Handler</div></div></div></foreignObject><text x="430" y="193" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Offline...</text></switch></g><path d="M 860 424 L 860 467.26" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" stroke-dasharray="6 6" 
pointer-events="stroke"/><path d="M 860 477.76 L 853 463.76 L 860 467.26 L 867 463.76 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><rect x="380" y="960" width="320" height="120" fill="#d5e8d4" stroke="#82b366" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 158px; height: 1px; padding-top: 510px; margin-left: 191px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">Consumer Application<br /><font style="font-size: 10px">(with multiple ways of <br />consuming Crawler data)</font></div></div></div></foreignObject><text x="270" y="514" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Consumer Application...</text></switch></g><path d="M 140 900 L 140 1000 Q 140 1020 160 1020 L 367.26 1020" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" stroke-dasharray="6 6" pointer-events="stroke"/><path d="M 377.76 1020 L 363.76 1027 L 367.26 1020 L 363.76 1013 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><path d="M 540 900 L 540 947.26" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" stroke-dasharray="6 6" pointer-events="stroke"/><path d="M 540 957.76 L 533 943.76 L 540 947.26 L 547 943.76 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" 
stroke-miterlimit="10" pointer-events="all"/><path d="M 860 970 L 860 1000 Q 860 1020 840 1020 L 712.74 1020" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" stroke-dasharray="6 6" pointer-events="stroke"/><path d="M 702.24 1020 L 716.24 1013 L 712.74 1020 L 716.24 1027 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/><rect x="420" y="810" width="240" height="60" rx="9" ry="9" fill="#e1d5e7" stroke="#9673a6" stroke-width="2" pointer-events="all"/><g transform="translate(-0.5 -0.5)scale(2)"><switch><foreignObject style="overflow: visible; text-align: left;" pointer-events="none" width="100%" height="100%" requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"><div xmlns="http://www.w3.org/1999/xhtml" style="display: flex; align-items: unsafe center; justify-content: unsafe center; width: 118px; height: 1px; padding-top: 420px; margin-left: 211px;"><div style="box-sizing: border-box; font-size: 0px; text-align: center;" data-drawio-colors="color: rgb(0, 0, 0); "><div style="display: inline-block; font-size: 12px; font-family: Helvetica; color: rgb(0, 0, 0); line-height: 1.2; pointer-events: all; font-weight: bold; white-space: normal; overflow-wrap: normal;">Scraper</div></div></div></foreignObject><text x="270" y="424" fill="rgb(0, 0, 0)" font-family="Helvetica" font-size="12px" text-anchor="middle" font-weight="bold">Scraper</text></switch></g><path d="M 540 780 L 540 797.26" fill="none" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="stroke"/><path d="M 540 807.76 L 533 793.76 L 540 797.26 L 547 793.76 Z" fill="rgb(0, 0, 0)" stroke="rgb(0, 0, 0)" stroke-width="2" stroke-miterlimit="10" pointer-events="all"/></g><switch><g requiredFeatures="http://www.w3.org/TR/SVG11/feature#Extensibility"/><a transform="translate(0,-5)" xlink:href="https://www.drawio.com/doc/faq/svg-export-text-problems" target="_blank"><text text-anchor="middle" 
font-size="10px" x="50%" y="100%">Text is not SVG - cannot display</text></a></switch></svg>
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
defmodule Crawler.Example.GoogleSearch.Scraper do
  @moduledoc """
  Scraper used by the Google search example.

  Only pages whose URL starts with `https://github.com` are inspected: the
  project's name and description are read from the page body and recorded in
  the `Crawler.Example.GoogleSearch.Data` agent. Any other page is passed
  through untouched.
  """

  @behaviour Crawler.Scraper.Spec

  alias Crawler.Store.Page
  alias Crawler.Example.GoogleSearch.Data

  @doc """
  Extracts the project name and description from a GitHub page.

  When a non-empty name is found, `%{url: url, desc: desc}` is stored under
  that name in the `Data` agent. Always returns `{:ok, page}`.
  """
  def scrape(%Page{url: "https://github.com" <> _ = url, body: body, opts: _opts} = page) do
    parsed = Floki.parse_document!(body)

    project_name =
      parsed
      |> Floki.find("#repository-container-header strong a")
      |> Floki.text()

    description =
      parsed
      |> Floki.find(".Layout-sidebar p.f4")
      |> Floki.text()
      |> String.trim()

    remember(project_name, url, description)

    {:ok, page}
  end

  def scrape(page), do: {:ok, page}

  # Pages without a repository header yield an empty name; nothing is recorded for them.
  defp remember("", _url, _description), do: nil

  defp remember(project_name, url, description) do
    Agent.update(Data, &Map.put(&1, project_name, %{url: url, desc: description}))
  end
end
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
defmodule Crawler.Example.GoogleSearch.UrlFilter do
  @moduledoc """
  URL filter for the Google search example.

  The crawl starts on Google and is then only allowed to visit GitHub, so a
  URL passes when it uses `https` and its host is exactly `www.google.com` or
  `github.com`.
  """

  @behaviour Crawler.Fetcher.UrlFilter.Spec

  @allowed_hosts ["www.google.com", "github.com"]

  @doc """
  Returns `{:ok, true}` for `https` URLs hosted on an allowed host, and
  `{:ok, false}` for everything else.

  The host is compared after parsing the URL rather than by string prefix. A
  prefix match on `"https://github.com"` would also accept unrelated hosts
  such as `github.community` or `github.com.evil.example`, as well as
  userinfo tricks like `https://github.com@evil.example/`.
  """
  @impl true
  def filter(url, _opts) when is_binary(url) do
    case URI.parse(url) do
      %URI{scheme: "https", host: host} when host in @allowed_hosts -> {:ok, true}
      _other -> {:ok, false}
    end
  end

  # Anything that is not a string cannot be an allowed URL.
  def filter(_url, _opts), do: {:ok, false}
end
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
defmodule Crawler.Example.GoogleSearch do
  @moduledoc """
  This example performs a Google search, then scrapes the results to find GitHub
  projects and outputs their names and descriptions.

  Example output:

      Agent.get(Data, & &1) #=> %{
        "crawler" => %{
          desc: "A high performance web crawler / scraper in Elixir.",
          url: "https://github.com/fredwu/crawler"
        },
        "crawly" => %{
          desc: "Crawly, a high-level web crawling & scraping framework for Elixir.",
          url: "https://github.com/elixir-crawly/crawly"
        },
        "elixir_scraper" => %{
          desc: "Elixir/Hound web scraper example",
          url: "https://github.com/jaydorsey/elixir_scraper"
        },
        "mechanize" => %{
          desc: "Build web scrapers and automate interaction with websites in Elixir with ease!",
          url: "https://github.com/gushonorato/mechanize"
        }
      }
  """

  alias Crawler.Example.GoogleSearch.Data
  alias Crawler.Example.GoogleSearch.Scraper
  alias Crawler.Example.GoogleSearch.UrlFilter

  @site_url "https://www.google.com/search?"
  @search_term "github web scrapers in Elixir"

  # Upper bound on how long `run/0` polls for the crawl to finish. The polling
  # loop previously never stopped at all, so the bound is generous: a polite
  # crawl can easily take longer than a few seconds.
  @wait_timeout 60_000

  # Pause between two polls, in milliseconds.
  @wait_interval 500

  @doc """
  Runs the example: starts the `Data` agent, crawls the search results and
  prints the collected projects with `dbg/1` once the crawler has stopped.

  Raises `MatchError` if the crawler is still running after the wait timeout.
  """
  def run do
    Agent.start_link(fn -> %{} end, name: Data)

    # Do not crawl Google too fast, or you will get blocked

    {:ok, opts} =
      Crawler.crawl(
        search_url(),
        workers: 2,
        max_depths: 2,
        max_pages: 10,
        interval: 80,
        user_agent: "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/118.0",
        scraper: Scraper,
        url_filter: UrlFilter
      )

    wait(fn ->
      # Raises `MatchError` while the crawl is in progress, which makes `wait/2` poll again.
      false = Crawler.running?(opts)

      # give the scraper time to finish
      Process.sleep(2_000)

      dbg(Agent.get(Data, & &1))
    end)
  end

  defp search_url do
    @site_url <> URI.encode_query(%{"q" => @search_term})
  end

  defp wait(fun), do: wait(@wait_timeout, fun)

  # Time budget exhausted: run one last time and let any failure surface
  # instead of polling forever.
  defp wait(0, fun), do: fun.()

  # Only the failed `false = ...` match means "not finished yet"; any other
  # exception is a real error and is allowed to propagate.
  defp wait(timeout, fun) do
    fun.()
  rescue
    MatchError ->
      Process.sleep(@wait_interval)
      wait(max(0, timeout - @wait_interval), fun)
  end
end
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
defmodule Crawler.Dispatcher.Worker do
  @moduledoc """
  Worker process that carries out a single crawl.
  """

  @doc """
  Starts a linked task that calls `Crawler.crawl_now/1` with the given `opts`.
  """
  def start_link(opts) do
    crawl = fn -> Crawler.crawl_now(opts) end

    Task.start_link(crawl)
  end
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
defmodule Crawler.Dispatcher do
  @moduledoc """
  Hands crawl requests over to `Crawler.crawl/2`.
  """

  @doc """
  Extracts the URL from `request` and calls `Crawler.crawl/2` with it.

  `request` is a tuple in one of two shapes:

  - `{_, link, _, url}` for a link that was transformed into a URL
  - `{_, url}` for something that already is a URL
  """
  def dispatch(request, opts) do
    target_url =
      case request do
        {_, _link, _, url} -> url
        {_, url} -> url
      end

    Crawler.crawl(target_url, opts)
  end
end
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
defmodule Crawler.Fetcher.HeaderPreparer do
  @moduledoc """
  Captures and prepares HTTP response headers.
  """

  @default_content_type "text/html"

  @doc """
  Stores the raw `headers` and a normalized `:content_type` in `opts`.

  The content type is taken from the first `Content-Type` header (matched
  case-insensitively). Its parameters (everything from the first `;`) and
  surrounding whitespace are dropped, and the result is lower-cased, because
  media types are case-insensitive. When `headers` is `nil` or has no
  `Content-Type` header, `"text/html"` is used.

  ## Examples

      iex> HeaderPreparer.prepare(
      iex>   [{"Content-Type", "text/html"}],
      iex>   %{}
      iex> )
      %{headers: [{"Content-Type", "text/html"}], content_type: "text/html"}

      iex> HeaderPreparer.prepare(
      iex>   [{"Content-Type", "text/css"}],
      iex>   %{}
      iex> )
      %{headers: [{"Content-Type", "text/css"}], content_type: "text/css"}

      iex> HeaderPreparer.prepare(
      iex>   [{"Content-Type", "image/png; blah"}],
      iex>   %{}
      iex> )
      %{headers: [{"Content-Type", "image/png; blah"}], content_type: "image/png"}

      iex> HeaderPreparer.prepare(
      iex>   [{"content-type", "Text/HTML ; charset=utf-8"}],
      iex>   %{}
      iex> )
      %{headers: [{"content-type", "Text/HTML ; charset=utf-8"}], content_type: "text/html"}
  """
  def prepare(headers, opts) do
    content_type =
      headers
      |> get_content_type()
      |> simplify_content_type()

    opts
    |> Map.put(:headers, headers)
    |> Map.put(:content_type, content_type)
  end

  defp get_content_type(nil), do: @default_content_type

  defp get_content_type(headers) do
    case Enum.find(headers, &content_type_header?/1) do
      {_, value} -> value
      _ -> @default_content_type
    end
  end

  # Header names are case-insensitive, so compare the lower-cased name.
  defp content_type_header?({header, _}) do
    String.downcase(header) == "content-type"
  end

  # Keeps only the `type/subtype` part. Whitespace is allowed around the `;`
  # that introduces parameters, hence the trim; without it a value such as
  # "text/html ; charset=utf-8" would keep a trailing space.
  defp simplify_content_type(content_type) do
    content_type
    |> String.split(";", parts: 2)
    |> hd()
    |> String.trim()
    |> String.downcase()
  end
end
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
defmodule Crawler.Fetcher.Modifier do
  @moduledoc """
  Modifies request options and headers before dispatch.

  This default implementation adds nothing; supply your own module
  implementing `Crawler.Fetcher.Modifier.Spec` to customise requests.
  """

  defmodule Spec do
    @moduledoc """
    Spec for defining a fetch modifier.
    """

    @type url :: String.t()
    @type header :: {String.t(), String.t()}
    @type opts :: map

    @callback headers(opts) :: list(header)
    @callback opts(opts) :: keyword
  end

  @behaviour __MODULE__.Spec

  @doc """
  Allows modifying headers prior to making a crawl request.

  Returns a list of `{name, value}` header tuples; this implementation
  returns `[]`.

  ## Example implementation

      def headers(opts) do
        if opts[:url] == "http://modifier" do
          [{"Referer", "http://fetcher"}]
        else
          []
        end
      end
  """
  @impl true
  def headers(_opts), do: []

  @doc """
  Allows passing options to HTTPoison prior to making the crawl request.

  Returns a keyword list of options; this implementation returns `[]`.

  ## Example implementation

      def opts(opts) do
        if opts[:url] == "http://modifier" do
          # add a new pool to hackney
          [hackney: [pool: :modifier]]
        else
          []
        end
      end
  """
  @impl true
  def opts(_opts), do: []
end
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
defmodule Crawler.Fetcher.Policer do
  @moduledoc """
  Checks a series of conditions to determine whether it is okay to continue.
  """

  require Logger

  alias Crawler.Store

  @uri_schemes ["http", "https"]
  @asset_extra_depth 2

  @doc """
  Checks a series of conditions to determine whether it is okay to continue,
  i.e. to allow `Crawler.Fetcher.fetch/1` to begin its tasks.

  The checks run in order and stop at the first failure:

  1. the page budget (`:max_pages`) is not used up,
  2. the current `:depth` is below the allowed depth,
  3. the URL scheme is `http` or `https`,
  4. the URL has not been recorded in the store yet,
  5. the configured `:url_filter` lets the URL through.

  A check whose required options are absent from `opts` passes.

  Returns `{:ok, opts}` when every check passes, otherwise `{:warn, message}`
  naming the failed check.
  """
  def police(opts) do
    with {_, true} <- within_max_pages?(opts),
         {_, true} <- within_fetch_depth?(opts),
         {_, true} <- acceptable_uri_scheme?(opts),
         {_, true} <- not_fetched_yet?(opts),
         {_, true} <- perform_url_filtering(opts) do
      {:ok, opts}
    else
      {fail_type, _} -> police_warn(fail_type, opts)
    end
  end

  defp within_max_pages?(%{max_pages: :infinity} = _opts), do: {:within_max_pages?, true}

  defp within_max_pages?(%{max_pages: max_pages} = _opts) when is_integer(max_pages) do
    {:within_max_pages?, Store.ops_count() < max_pages}
  end

  defp within_max_pages?(_opts), do: {:within_max_pages?, true}

  # Links found in `<a>` tags are held to `max_depths`; anything found through
  # another tag is allowed `@asset_extra_depth` more levels.
  defp within_fetch_depth?(%{depth: depth, max_depths: max_depths} = opts) do
    max_depths =
      case opts[:html_tag] do
        "a" -> max_depths
        _ -> max_depths + @asset_extra_depth
      end

    {:within_fetch_depth?, depth < max_depths}
  end

  defp within_fetch_depth?(_opts), do: {:within_fetch_depth?, true}

  defp acceptable_uri_scheme?(%{url: url} = _opts) do
    scheme =
      url
      |> String.split("://", parts: 2)
      |> Kernel.hd()

    {:acceptable_uri_scheme?, Enum.member?(@uri_schemes, scheme)}
  end

  defp acceptable_uri_scheme?(_opts), do: {:acceptable_uri_scheme?, true}

  defp not_fetched_yet?(%{url: url, scope: scope} = _opts) do
    {:not_fetched_yet?, !Store.find({url, scope})}
  end

  defp not_fetched_yet?(_opts), do: {:not_fetched_yet?, true}

  # The `Crawler.Fetcher.UrlFilter.Spec` callback may return `{:error, term}`.
  # A filter that cannot decide is treated as a rejection, so the URL is
  # skipped with a warning instead of crashing the policer on a failed match.
  defp perform_url_filtering(%{url_filter: url_filter, url: url} = opts) do
    case url_filter.filter(url, opts) do
      {:ok, pass_through?} -> {:perform_url_filtering, pass_through?}
      {:error, _reason} -> {:perform_url_filtering, false}
    end
  end

  defp perform_url_filtering(_opts), do: {:perform_url_filtering, true}

  defp police_warn(fail_type, opts) do
    {:warn, "Fetch failed check '#{fail_type}', with opts: #{Kernel.inspect(opts)}."}
  end
end
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
defmodule Crawler.Fetcher.Recorder do
  @moduledoc """
  Keeps track of every crawl for the crawler's own bookkeeping.
  """

  alias Crawler.Store

  @doc """
  Registers the URL in the store and bumps the crawl depth by one.

  `opts` may be any enumerable of key/value pairs; it is converted to a map.
  Returns `{:ok, opts}` with the incremented `:depth`, or whatever the store
  returned when adding the URL did not succeed.

  ## Examples

      iex> Recorder.record(url: "url1", depth: 2)
      {:ok, %{depth: 3, url: "url1"}}

      iex> Recorder.record(url: "url2", depth: 2)
      iex> Store.find({"url2", nil})
      %Page{url: "url2"}
  """
  def record(opts) do
    opts = Enum.into(opts, %{})

    case store_url(opts) do
      {:ok, _pid} -> {:ok, store_url_depth(opts)}
      other -> other
    end
  end

  @doc """
  Hands the page body to the configured store module, if there is one.

  With `store: nil` nothing is stored and `{:ok, nil}` is returned. Otherwise
  the result of the store's `add_page_data/3` is returned wrapped in `{:ok, _}`.

  ## Examples

      iex> Recorder.maybe_store_page("body", %{store: nil})
      {:ok, nil}

      iex> Recorder.record(url: "url", depth: 2)
      iex> Recorder.maybe_store_page("body", %{store: Store, url: "url", scope: nil})
      {:ok, {%Page{url: "url", body: "body", opts: %{store: Store, url: "url", scope: nil}}, %Page{url: "url", body: nil}}}
  """
  def maybe_store_page(_body, %{store: nil} = _opts), do: {:ok, nil}

  def maybe_store_page(body, opts) do
    store = opts[:store]
    page_key = {opts[:url], opts[:scope]}

    {:ok, store.add_page_data(page_key, body, opts)}
  end

  defp store_url(opts) do
    page_key = {opts[:url], opts[:scope]}

    Store.add(page_key)
  end

  defp store_url_depth(opts) do
    %{opts | depth: opts[:depth] + 1}
  end
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
defmodule Crawler.Fetcher.Requester do
  @moduledoc """
  Issues the HTTP request for a crawl.
  """

  alias Crawler.HTTP

  @fetch_opts [
    follow_redirect: true,
    max_redirect: 5
  ]

  @doc """
  Performs a GET request for `opts[:url]` through `Crawler.HTTP`.

  The request carries a `User-Agent` header taken from `opts[:user_agent]`
  plus the headers returned by the `opts[:modifier]` module. The request
  options are the redirect defaults, `recv_timeout: opts[:timeout]`, and the
  options returned by the modifier.

  ## Examples

      iex> Requester.make(url: "fake.url", modifier: Crawler.Fetcher.Modifier)
      {:error, %HTTPoison.Error{id: nil, reason: :nxdomain}}
  """
  def make(opts) do
    url = opts[:url]
    headers = fetch_headers(opts)
    request_opts = fetch_opts(opts)

    HTTP.get(url, headers, request_opts)
  end

  defp fetch_headers(opts) do
    [{"User-Agent", opts[:user_agent]} | opts[:modifier].headers(opts)]
  end

  defp fetch_opts(opts) do
    @fetch_opts ++ [{:recv_timeout, opts[:timeout]} | opts[:modifier].opts(opts)]
  end
end
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
defmodule Crawler.Fetcher.Retrier do
  @moduledoc """
  Retries crawls that failed.
  """

  defmodule Spec do
    @moduledoc """
    Spec for defining a fetch retrier.
    """

    @type fetch_url :: fun
    @type opts :: map

    @callback perform(fetch_url, opts) :: term
  end

  use Retry

  @behaviour __MODULE__.Spec

  @doc """
  Calls `fetch_url` and retries it with exponential backoff.

  At most `opts[:retries]` delays are taken from the backoff stream, which
  expires after `opts[:timeout]` milliseconds (5000 when `:timeout` is not an
  integer). Returns the result of `fetch_url` as-is, whether it succeeded or not.

  More information: [https://github.com/safwank/ElixirRetry](https://github.com/safwank/ElixirRetry)
  """
  def perform(fetch_url, opts) do
    retry with: delays(opts) do
      fetch_url.()
    after
      result -> result
    else
      error -> error
    end
  end

  # Builds the stream of delays handed to `retry`.
  defp delays(opts) do
    exponential_backoff()
    |> expiry(timeout_value(opts[:timeout]))
    |> Stream.take(opts[:retries])
  end

  defp timeout_value(value) when is_integer(value), do: value
  defp timeout_value(_value), do: 5_000
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
defmodule Crawler.Fetcher.UrlFilter do
  @moduledoc """
  Default URL filter: a placeholder that accepts every URL.
  """

  defmodule Spec do
    @moduledoc """
    Spec for defining a URL filter.
    """

    @type url :: String.t()
    @type opts :: map

    @callback filter(url, opts) :: {:ok, boolean} | {:error, term}
  end

  @behaviour __MODULE__.Spec

  @doc """
  Decides whether a given URL may pass through.

  - `{:ok, true}` lets the URL through
  - `{:ok, false}` rejects the URL

  This implementation always returns `{:ok, true}`.
  """
  def filter(_url, _opts) do
    {:ok, true}
  end
end
|