powerdlz23 1.2.2 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Spider/README.md +19 -0
- package/Spider/domain.py +18 -0
- package/Spider/general.py +51 -0
- package/Spider/link_finder.py +25 -0
- package/Spider/main.py +50 -0
- package/Spider/spider.py +74 -0
- package/crawler/.formatter.exs +5 -0
- package/crawler/.github/workflows/ci.yml +29 -0
- package/crawler/.recode.exs +33 -0
- package/crawler/.tool-versions +2 -0
- package/crawler/CHANGELOG.md +82 -0
- package/crawler/README.md +198 -0
- package/crawler/architecture.svg +4 -0
- package/crawler/config/config.exs +9 -0
- package/crawler/config/dev.exs +5 -0
- package/crawler/config/test.exs +5 -0
- package/crawler/examples/google_search/scraper.ex +37 -0
- package/crawler/examples/google_search/url_filter.ex +11 -0
- package/crawler/examples/google_search.ex +77 -0
- package/crawler/lib/crawler/dispatcher/worker.ex +14 -0
- package/crawler/lib/crawler/dispatcher.ex +20 -0
- package/crawler/lib/crawler/fetcher/header_preparer.ex +60 -0
- package/crawler/lib/crawler/fetcher/modifier.ex +45 -0
- package/crawler/lib/crawler/fetcher/policer.ex +77 -0
- package/crawler/lib/crawler/fetcher/recorder.ex +55 -0
- package/crawler/lib/crawler/fetcher/requester.ex +32 -0
- package/crawler/lib/crawler/fetcher/retrier.ex +43 -0
- package/crawler/lib/crawler/fetcher/url_filter.ex +26 -0
- package/crawler/lib/crawler/fetcher.ex +81 -0
- package/crawler/lib/crawler/http.ex +7 -0
- package/crawler/lib/crawler/linker/path_builder.ex +71 -0
- package/crawler/lib/crawler/linker/path_expander.ex +59 -0
- package/crawler/lib/crawler/linker/path_finder.ex +106 -0
- package/crawler/lib/crawler/linker/path_offliner.ex +59 -0
- package/crawler/lib/crawler/linker/path_prefixer.ex +46 -0
- package/crawler/lib/crawler/linker.ex +173 -0
- package/crawler/lib/crawler/options.ex +127 -0
- package/crawler/lib/crawler/parser/css_parser.ex +37 -0
- package/crawler/lib/crawler/parser/guarder.ex +38 -0
- package/crawler/lib/crawler/parser/html_parser.ex +41 -0
- package/crawler/lib/crawler/parser/link_parser/link_expander.ex +32 -0
- package/crawler/lib/crawler/parser/link_parser.ex +50 -0
- package/crawler/lib/crawler/parser.ex +122 -0
- package/crawler/lib/crawler/queue_handler.ex +45 -0
- package/crawler/lib/crawler/scraper.ex +28 -0
- package/crawler/lib/crawler/snapper/dir_maker.ex +45 -0
- package/crawler/lib/crawler/snapper/link_replacer.ex +95 -0
- package/crawler/lib/crawler/snapper.ex +82 -0
- package/crawler/lib/crawler/store/counter.ex +19 -0
- package/crawler/lib/crawler/store/page.ex +7 -0
- package/crawler/lib/crawler/store.ex +87 -0
- package/crawler/lib/crawler/worker.ex +62 -0
- package/crawler/lib/crawler.ex +91 -0
- package/crawler/mix.exs +78 -0
- package/crawler/mix.lock +40 -0
- package/crawler/test/fixtures/introducing-elixir.jpg +0 -0
- package/crawler/test/integration_test.exs +135 -0
- package/crawler/test/lib/crawler/dispatcher/worker_test.exs +7 -0
- package/crawler/test/lib/crawler/dispatcher_test.exs +5 -0
- package/crawler/test/lib/crawler/fetcher/header_preparer_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher/policer_test.exs +71 -0
- package/crawler/test/lib/crawler/fetcher/recorder_test.exs +9 -0
- package/crawler/test/lib/crawler/fetcher/requester_test.exs +9 -0
- package/crawler/test/lib/crawler/fetcher/retrier_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher/url_filter_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher_test.exs +153 -0
- package/crawler/test/lib/crawler/http_test.exs +47 -0
- package/crawler/test/lib/crawler/linker/path_builder_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_expander_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_finder_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_offliner_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_prefixer_test.exs +7 -0
- package/crawler/test/lib/crawler/linker_test.exs +7 -0
- package/crawler/test/lib/crawler/options_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/css_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/guarder_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/html_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/link_parser/link_expander_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/link_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser_test.exs +8 -0
- package/crawler/test/lib/crawler/queue_handler_test.exs +7 -0
- package/crawler/test/lib/crawler/scraper_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper/dir_maker_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper/link_replacer_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper_test.exs +9 -0
- package/crawler/test/lib/crawler/worker_test.exs +5 -0
- package/crawler/test/lib/crawler_test.exs +295 -0
- package/crawler/test/support/test_case.ex +24 -0
- package/crawler/test/support/test_helpers.ex +28 -0
- package/crawler/test/test_helper.exs +7 -0
- package/package.json +1 -1
- package/pto/CryptoNoter/.gitattributes +2 -0
- package/pto/CryptoNoter/CryptoNight.md +444 -0
- package/pto/CryptoNoter/CryptoNight.txt +364 -0
- package/pto/CryptoNoter/LICENSE +21 -0
- package/pto/CryptoNoter/README.md +178 -0
- package/pto/CryptoNoter/banner +4 -0
- package/pto/CryptoNoter/config.json +8 -0
- package/pto/CryptoNoter/install.sh +60 -0
- package/pto/CryptoNoter/package-lock.json +33 -0
- package/pto/CryptoNoter/package.json +16 -0
- package/pto/CryptoNoter/server.js +225 -0
- package/pto/CryptoNoter/web/demo.html +81 -0
- package/pto/CryptoNoter/web/index.html +1 -0
- package/pto/CryptoNoter/web/lib/cryptonight-asmjs.min.js +16891 -0
- package/pto/CryptoNoter/web/lib/cryptonight-asmjs.min.js.mem +0 -0
- package/pto/CryptoNoter/web/lib/cryptonight.wasm +0 -0
- package/pto/CryptoNoter/web/processor.js +496 -0
- package/pto/CryptoNoter/web/worker.js +5549 -0
- package/pto/crypto/README.md +1 -0
- package/pto/crypto/aes256cbc/README.md +59 -0
- package/pto/crypto/aes256cbc/aes256cbc.go +172 -0
- package/pto/crypto/aes256cbc/aes256cbc_test.go +105 -0
- package/pto/crypto/aes256cbc/examples_test.go +30 -0
- package/pto/crypto/dh64/README.md +84 -0
- package/pto/crypto/dh64/c/dh64.c +75 -0
- package/pto/crypto/dh64/c/dh64.h +12 -0
- package/pto/crypto/dh64/c/dh64_test.c +30 -0
- package/pto/crypto/dh64/csharp/dh64.cs +77 -0
- package/pto/crypto/dh64/csharp/dh64_test.cs +1074 -0
- package/pto/crypto/dh64/go/dh64.go +72 -0
- package/pto/crypto/dh64/go/dh64_test.go +1064 -0
- package/pto/crypto/mt19937/README.md +30 -0
- package/pto/crypto/mt19937/c/mt19937-64.c +180 -0
- package/pto/crypto/mt19937/c/mt19937-64.h +96 -0
- package/pto/crypto/mt19937/c/mt19937-64.out.txt +401 -0
- package/pto/crypto/mt19937/c/mt19937-64test.c +78 -0
- package/pto/crypto/mt19937/csharp/mt19937.cs +139 -0
- package/pto/crypto/mt19937/csharp/mt19937_test.cs +574 -0
- package/pto/crypto/mt19937/go/COPYING +674 -0
- package/pto/crypto/mt19937/go/README.rst +103 -0
- package/pto/crypto/mt19937/go/doc.go +35 -0
- package/pto/crypto/mt19937/go/example.go +32 -0
- package/pto/crypto/mt19937/go/mt19937.go +149 -0
- package/pto/crypto/mt19937/go/mt19937_test.go +614 -0
- package/pto/crypto/rc4/README.md +14 -0
- package/pto/crypto/rc4/csharp/rc4.cs +119 -0
- package/pto/crypto/rc4/csharp/rc4_echo_client.cs +78 -0
- package/pto/crypto/rc4/go/rc4_echo_client.go +102 -0
- package/pto/crypto/rc4/go/rc4_echo_server.go +110 -0
- package/rubyretriever/.rspec +2 -0
- package/rubyretriever/.travis.yml +7 -0
- package/rubyretriever/Gemfile +3 -0
- package/rubyretriever/Gemfile.lock +64 -0
- package/rubyretriever/LICENSE +20 -0
- package/rubyretriever/Rakefile +7 -0
- package/rubyretriever/bin/rr +79 -0
- package/rubyretriever/lib/retriever/cli.rb +25 -0
- package/rubyretriever/lib/retriever/core_ext.rb +13 -0
- package/rubyretriever/lib/retriever/fetch.rb +268 -0
- package/rubyretriever/lib/retriever/fetchfiles.rb +71 -0
- package/rubyretriever/lib/retriever/fetchseo.rb +18 -0
- package/rubyretriever/lib/retriever/fetchsitemap.rb +43 -0
- package/rubyretriever/lib/retriever/link.rb +47 -0
- package/rubyretriever/lib/retriever/openuri_redirect_patch.rb +8 -0
- package/rubyretriever/lib/retriever/page.rb +104 -0
- package/rubyretriever/lib/retriever/page_iterator.rb +21 -0
- package/rubyretriever/lib/retriever/target.rb +47 -0
- package/rubyretriever/lib/retriever/version.rb +4 -0
- package/rubyretriever/lib/retriever.rb +15 -0
- package/rubyretriever/readme.md +166 -0
- package/rubyretriever/rubyretriever.gemspec +41 -0
- package/rubyretriever/spec/link_spec.rb +77 -0
- package/rubyretriever/spec/page_spec.rb +94 -0
- package/rubyretriever/spec/retriever_spec.rb +84 -0
- package/rubyretriever/spec/spec_helper.rb +17 -0
- package/rubyretriever/spec/target_spec.rb +55 -0

package/crawler/test/lib/crawler_test.exs
@@ -0,0 +1,295 @@
+defmodule CrawlerTest do
+  use Crawler.TestCase, async: false
+
+  alias Crawler.Store
+
+  @moduletag capture_log: true
+
+  doctest Crawler
+
+  test ".crawl", %{bypass: bypass, url: url} do
+    Store.ops_reset()
+
+    url = "#{url}/crawler"
+    linked_url1 = "#{url}/link1"
+    linked_url2 = "#{url}/link2"
+    linked_url3 = "#{url}/link3"
+    linked_url4 = "#{url}/link4"
+
+    Bypass.expect_once(bypass, "GET", "/crawler", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url1}">1</a></html>
+      <html><a href="#{linked_url2}">2</a></html>
+      """)
+    end)
+
+    Bypass.expect_once(bypass, "GET", "/crawler/link1", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url2}">2</a></html>
+      """)
+    end)
+
+    Bypass.expect_once(bypass, "GET", "/crawler/link2", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url3}">3</a></html>
+      """)
+    end)
+
+    Bypass.expect_once(bypass, "GET", "/crawler/link3", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url4}">4</a></html>
+      """)
+    end)
+
+    {:ok, opts} = Crawler.crawl(url, max_depths: 3, workers: 3, interval: 100, store: Store)
+
+    assert Crawler.running?(opts)
+
+    Crawler.pause(opts)
+
+    refute Crawler.running?(opts)
+
+    assert opts[:workers] == 3
+
+    Crawler.resume(opts)
+
+    assert Crawler.running?(opts)
+
+    wait(fn ->
+      assert Store.ops_count() == 4
+    end)
+
+    wait(fn ->
+      assert %Store.Page{url: ^url, opts: %{workers: 3}} = Store.find_processed({url, nil})
+
+      assert Store.find_processed({linked_url1, nil})
+      assert Store.find_processed({linked_url2, nil})
+      assert Store.find_processed({linked_url3, nil})
+      refute Store.find({linked_url4, nil})
+
+      urls = Crawler.Store.all_urls()
+
+      assert Enum.member?(urls, {url, nil})
+      assert Enum.member?(urls, {linked_url1, nil})
+      assert Enum.member?(urls, {linked_url2, nil})
+      assert Enum.member?(urls, {linked_url3, nil})
+      refute Enum.member?(urls, {linked_url4, nil})
+    end)
+
+    wait(fn ->
+      refute Crawler.running?(opts)
+      assert OPQ.info(opts[:queue]) == {:normal, %OPQ.Queue{data: {[], []}}, 3}
+    end)
+  end
+
+  test ".crawl without a store", %{bypass: bypass, url: url} do
+    url = "#{url}/crawler_without_store"
+
+    Bypass.expect_once(bypass, "GET", "/crawler_without_store", fn conn ->
+      Plug.Conn.resp(conn, 200, "200")
+    end)
+
+    {:ok, opts} = Crawler.crawl(url, max_depths: 1, workers: 1, interval: 100, store: nil)
+
+    wait(fn ->
+      assert %Store.Page{url: ^url, body: nil, opts: nil} = Store.find_processed({url, nil})
+    end)
+
+    wait(fn ->
+      assert OPQ.info(opts[:queue]) == {:normal, %OPQ.Queue{data: {[], []}}, 1}
+    end)
+  end
+
+  test ".crawl with max_pages", %{bypass: bypass, url: url} do
+    Store.ops_reset()
+
+    url = "#{url}/crawler_with_max_pages"
+    linked_url1 = "#{url}/link1"
+    linked_url2 = "#{url}/link2"
+    linked_url3 = "#{url}/link3"
+    linked_url4 = "#{url}/link4"
+    linked_url5 = "#{url}/link5"
+
+    Bypass.expect_once(bypass, "GET", "/crawler_with_max_pages", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url1}">1</a></html>
+      <html><a href="#{linked_url2}">2</a></html>
+      <html><a href="#{linked_url3}">3</a></html>
+      <html><a href="#{linked_url4}">4</a></html>
+      <html><a href="#{linked_url5}">5</a></html>
+      """)
+    end)
+
+    Bypass.expect_once(bypass, "GET", "/crawler_with_max_pages/link1", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url2}">2</a></html>
+      <html><a href="#{linked_url3}">3</a></html>
+      """)
+    end)
+
+    Bypass.expect_once(bypass, "GET", "/crawler_with_max_pages/link2", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url3}">3</a></html>
+      <html><a href="#{linked_url4}">4</a></html>
+      <html><a href="#{linked_url5}">5</a></html>
+      """)
+    end)
+
+    Bypass.stub(bypass, "GET", "/crawler_with_max_pages/link3", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url3}">3</a></html>
+      <html><a href="#{linked_url4}">4</a></html>
+      <html><a href="#{linked_url5}">5</a></html>
+      """)
+    end)
+
+    Bypass.stub(bypass, "GET", "/crawler_with_max_pages/link4", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url3}">3</a></html>
+      <html><a href="#{linked_url4}">4</a></html>
+      <html><a href="#{linked_url5}">5</a></html>
+      """)
+    end)
+
+    {:ok, opts} = Crawler.crawl(url, max_depths: 3, force: true, workers: 4, max_pages: 3, interval: 100)
+
+    wait(fn ->
+      assert Store.ops_count() == 4
+    end)
+
+    wait(fn ->
+      assert Store.find_processed({url, opts[:scope]})
+      assert Store.find_processed({linked_url1, opts[:scope]})
+      assert Store.find_processed({linked_url2, opts[:scope]})
+      assert Store.find_processed({linked_url3, opts[:scope]})
+      refute Store.find({linked_url4, opts[:scope]})
+      refute Store.find({linked_url5, opts[:scope]})
+    end)
+
+    wait(fn ->
+      assert OPQ.info(opts[:queue]) == {:normal, %OPQ.Queue{data: {[], []}}, 4}
+    end)
+  end
+
+  test ".crawl with an existing queue", %{bypass: bypass, url: url} do
+    Store.ops_reset()
+
+    url = "#{url}/crawler_with_queue"
+    linked_url1 = "#{url}/link1"
+    linked_url2 = "#{url}/link2"
+    linked_url3 = "#{url}/link3"
+    linked_url4 = "#{url}/link4"
+
+    Bypass.expect_once(bypass, "GET", "/crawler_with_queue/link1", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url2}">2</a></html>
+      """)
+    end)
+
+    Bypass.expect_once(bypass, "GET", "/crawler_with_queue/link2", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url3}">3</a></html>
+      """)
+    end)
+
+    Bypass.expect_once(bypass, "GET", "/crawler_with_queue/link3", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html>ok</html>
+      """)
+    end)
+
+    {:ok, queue} = OPQ.init(worker: Crawler.Dispatcher.Worker, workers: 2, interval: 100)
+
+    {:ok, opts1} = Crawler.crawl(linked_url1, store: Store, queue: queue)
+    {:ok, opts2} = Crawler.crawl(linked_url2, store: Store, queue: queue)
+
+    wait(fn ->
+      assert Store.ops_count() == 3
+    end)
+
+    wait(fn ->
+      assert Store.find_processed({linked_url1, nil})
+      assert Store.find_processed({linked_url2, nil})
+      assert Store.find_processed({linked_url3, nil})
+      refute Store.find_processed({linked_url4, nil})
+
+      urls = Crawler.Store.all_urls()
+
+      assert Enum.member?(urls, {linked_url1, nil})
+      assert Enum.member?(urls, {linked_url2, nil})
+      assert Enum.member?(urls, {linked_url3, nil})
+      refute Enum.member?(urls, {linked_url4, nil})
+    end)
+
+    wait(fn ->
+      assert OPQ.info(opts1[:queue]) == {:normal, %OPQ.Queue{data: {[], []}}, 2}
+      assert OPQ.info(opts2[:queue]) == {:normal, %OPQ.Queue{data: {[], []}}, 2}
+    end)
+  end
+
+  test ".crawl forced", %{bypass: bypass, url: url} do
+    Store.ops_reset()
+
+    url = "#{url}/crawler_forced"
+    linked_url1 = "#{url}/link1"
+    linked_url2 = "#{url}/link2"
+
+    Bypass.expect(bypass, "GET", "/crawler_forced", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url1}">1</a></html>
+      <html><a href="#{linked_url1}">1</a></html>
+      """)
+    end)
+
+    Bypass.expect(bypass, "GET", "/crawler_forced/link1", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url2}">2</a></html>
+      """)
+    end)
+
+    Bypass.expect(bypass, "GET", "/crawler_forced/link2", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html>ok</html>
+      """)
+    end)
+
+    {:ok, opts1} = Crawler.crawl(url, force: true, workers: 1, interval: 100)
+    {:ok, opts2} = Crawler.crawl(url, force: true, workers: 2, interval: 100)
+
+    refute opts1[:scope] == opts2[:scope]
+
+    wait(fn ->
+      assert Store.find_processed({url, opts1[:scope]})
+      assert Store.find_processed({url, opts2[:scope]})
+      assert Store.find_processed({linked_url1, opts1[:scope]})
+      assert Store.find_processed({linked_url1, opts2[:scope]})
+      assert Store.find_processed({linked_url2, opts1[:scope]})
+      assert Store.find_processed({linked_url2, opts2[:scope]})
+
+      assert Store.ops_count() >= 6
+      assert Store.ops_count() <= 10
+
+      assert OPQ.info(opts1[:queue]) == {:normal, %OPQ.Queue{data: {[], []}}, 1}
+      assert OPQ.info(opts2[:queue]) == {:normal, %OPQ.Queue{data: {[], []}}, 2}
+    end)
+  end
+
+  test ".crawl stopped", %{bypass: bypass, url: url} do
+    url = "#{url}/stop"
+    linked_url = "#{url}/stop1"
+
+    Bypass.expect_once(bypass, "GET", "/stop", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url}">1</a></html>
+      """)
+    end)
+
+    {:ok, opts} = Crawler.crawl(url, workers: 1, interval: 500)
+
+    Process.sleep(200)
+
+    Crawler.stop(opts)
+
+    refute Store.find({linked_url, nil})
+  end
+end

package/crawler/test/support/test_case.ex
@@ -0,0 +1,24 @@
+defmodule Crawler.TestCase do
+  use ExUnit.CaseTemplate
+
+  using do
+    quote do
+      import Crawler.TestHelpers
+    end
+  end
+
+  setup_all do
+    bypass = Bypass.open()
+    url = "http://localhost:#{bypass.port}"
+    path = "localhost-#{bypass.port}"
+
+    bypass2 = Bypass.open()
+    url2 = "http://localhost:#{bypass2.port}"
+    path2 = "localhost-#{bypass2.port}"
+
+    {
+      :ok,
+      bypass: bypass, url: url, path: path, bypass2: bypass2, url2: url2, path2: path2
+    }
+  end
+end

package/crawler/test/support/test_helpers.ex
@@ -0,0 +1,28 @@
+# Credit: https://gist.github.com/cblavier/5e15791387a6e22b98d8
+defmodule Crawler.TestHelpers do
+  def wait(fun), do: wait(500, fun)
+  def wait(0, fun), do: fun.()
+
+  def wait(timeout, fun) do
+    try do
+      fun.()
+    rescue
+      _ ->
+        :timer.sleep(10)
+        wait(max(0, timeout - 10), fun)
+    end
+  end
+
+  def tmp(path \\ "", filename \\ "") do
+    tmp_path = Path.join([File.cwd!(), "test", "tmp", path])
+
+    File.mkdir_p(tmp_path)
+
+    Path.join(tmp_path, filename)
+  end
+
+  def image_file do
+    {:ok, file} = File.read("test/fixtures/introducing-elixir.jpg")
+    file
+  end
+end

package/package.json CHANGED