powerdlz23 1.2.3 → 1.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Spider/README.md +19 -0
- package/Spider/domain.py +18 -0
- package/Spider/general.py +51 -0
- package/Spider/link_finder.py +25 -0
- package/Spider/main.py +50 -0
- package/Spider/spider.py +74 -0
- package/crawler/.formatter.exs +5 -0
- package/crawler/.github/workflows/ci.yml +29 -0
- package/crawler/.recode.exs +33 -0
- package/crawler/.tool-versions +2 -0
- package/crawler/CHANGELOG.md +82 -0
- package/crawler/README.md +198 -0
- package/crawler/architecture.svg +4 -0
- package/crawler/config/config.exs +9 -0
- package/crawler/config/dev.exs +5 -0
- package/crawler/config/test.exs +5 -0
- package/crawler/examples/google_search/scraper.ex +37 -0
- package/crawler/examples/google_search/url_filter.ex +11 -0
- package/crawler/examples/google_search.ex +77 -0
- package/crawler/lib/crawler/dispatcher/worker.ex +14 -0
- package/crawler/lib/crawler/dispatcher.ex +20 -0
- package/crawler/lib/crawler/fetcher/header_preparer.ex +60 -0
- package/crawler/lib/crawler/fetcher/modifier.ex +45 -0
- package/crawler/lib/crawler/fetcher/policer.ex +77 -0
- package/crawler/lib/crawler/fetcher/recorder.ex +55 -0
- package/crawler/lib/crawler/fetcher/requester.ex +32 -0
- package/crawler/lib/crawler/fetcher/retrier.ex +43 -0
- package/crawler/lib/crawler/fetcher/url_filter.ex +26 -0
- package/crawler/lib/crawler/fetcher.ex +81 -0
- package/crawler/lib/crawler/http.ex +7 -0
- package/crawler/lib/crawler/linker/path_builder.ex +71 -0
- package/crawler/lib/crawler/linker/path_expander.ex +59 -0
- package/crawler/lib/crawler/linker/path_finder.ex +106 -0
- package/crawler/lib/crawler/linker/path_offliner.ex +59 -0
- package/crawler/lib/crawler/linker/path_prefixer.ex +46 -0
- package/crawler/lib/crawler/linker.ex +173 -0
- package/crawler/lib/crawler/options.ex +127 -0
- package/crawler/lib/crawler/parser/css_parser.ex +37 -0
- package/crawler/lib/crawler/parser/guarder.ex +38 -0
- package/crawler/lib/crawler/parser/html_parser.ex +41 -0
- package/crawler/lib/crawler/parser/link_parser/link_expander.ex +32 -0
- package/crawler/lib/crawler/parser/link_parser.ex +50 -0
- package/crawler/lib/crawler/parser.ex +122 -0
- package/crawler/lib/crawler/queue_handler.ex +45 -0
- package/crawler/lib/crawler/scraper.ex +28 -0
- package/crawler/lib/crawler/snapper/dir_maker.ex +45 -0
- package/crawler/lib/crawler/snapper/link_replacer.ex +95 -0
- package/crawler/lib/crawler/snapper.ex +82 -0
- package/crawler/lib/crawler/store/counter.ex +19 -0
- package/crawler/lib/crawler/store/page.ex +7 -0
- package/crawler/lib/crawler/store.ex +87 -0
- package/crawler/lib/crawler/worker.ex +62 -0
- package/crawler/lib/crawler.ex +91 -0
- package/crawler/mix.exs +78 -0
- package/crawler/mix.lock +40 -0
- package/crawler/test/fixtures/introducing-elixir.jpg +0 -0
- package/crawler/test/integration_test.exs +135 -0
- package/crawler/test/lib/crawler/dispatcher/worker_test.exs +7 -0
- package/crawler/test/lib/crawler/dispatcher_test.exs +5 -0
- package/crawler/test/lib/crawler/fetcher/header_preparer_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher/policer_test.exs +71 -0
- package/crawler/test/lib/crawler/fetcher/recorder_test.exs +9 -0
- package/crawler/test/lib/crawler/fetcher/requester_test.exs +9 -0
- package/crawler/test/lib/crawler/fetcher/retrier_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher/url_filter_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher_test.exs +153 -0
- package/crawler/test/lib/crawler/http_test.exs +47 -0
- package/crawler/test/lib/crawler/linker/path_builder_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_expander_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_finder_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_offliner_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_prefixer_test.exs +7 -0
- package/crawler/test/lib/crawler/linker_test.exs +7 -0
- package/crawler/test/lib/crawler/options_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/css_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/guarder_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/html_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/link_parser/link_expander_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/link_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser_test.exs +8 -0
- package/crawler/test/lib/crawler/queue_handler_test.exs +7 -0
- package/crawler/test/lib/crawler/scraper_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper/dir_maker_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper/link_replacer_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper_test.exs +9 -0
- package/crawler/test/lib/crawler/worker_test.exs +5 -0
- package/crawler/test/lib/crawler_test.exs +295 -0
- package/crawler/test/support/test_case.ex +24 -0
- package/crawler/test/support/test_helpers.ex +28 -0
- package/crawler/test/test_helper.exs +7 -0
- package/grell/.rspec +2 -0
- package/grell/.travis.yml +28 -0
- package/grell/CHANGELOG.md +111 -0
- package/grell/Gemfile +7 -0
- package/grell/LICENSE.txt +22 -0
- package/grell/README.md +213 -0
- package/grell/Rakefile +2 -0
- package/grell/grell.gemspec +36 -0
- package/grell/lib/grell/capybara_driver.rb +44 -0
- package/grell/lib/grell/crawler.rb +83 -0
- package/grell/lib/grell/crawler_manager.rb +84 -0
- package/grell/lib/grell/grell_logger.rb +10 -0
- package/grell/lib/grell/page.rb +275 -0
- package/grell/lib/grell/page_collection.rb +62 -0
- package/grell/lib/grell/rawpage.rb +62 -0
- package/grell/lib/grell/reader.rb +18 -0
- package/grell/lib/grell/version.rb +3 -0
- package/grell/lib/grell.rb +11 -0
- package/grell/spec/lib/capybara_driver_spec.rb +38 -0
- package/grell/spec/lib/crawler_manager_spec.rb +174 -0
- package/grell/spec/lib/crawler_spec.rb +361 -0
- package/grell/spec/lib/page_collection_spec.rb +159 -0
- package/grell/spec/lib/page_spec.rb +418 -0
- package/grell/spec/lib/reader_spec.rb +43 -0
- package/grell/spec/spec_helper.rb +66 -0
- package/heartmagic/config.py +1 -0
- package/heartmagic/heart.py +3 -0
- package/heartmagic/pytransform/__init__.py +483 -0
- package/heartmagic/pytransform/_pytransform.dll +0 -0
- package/heartmagic/pytransform/_pytransform.so +0 -0
- package/httpStatusCode/README.md +2 -0
- package/httpStatusCode/httpStatusCode.js +4 -0
- package/httpStatusCode/reasonPhrases.js +344 -0
- package/httpStatusCode/statusCodes.js +344 -0
- package/package.json +1 -1
- package/rubyretriever/.rspec +2 -0
- package/rubyretriever/.travis.yml +7 -0
- package/rubyretriever/Gemfile +3 -0
- package/rubyretriever/Gemfile.lock +64 -0
- package/rubyretriever/LICENSE +20 -0
- package/rubyretriever/Rakefile +7 -0
- package/rubyretriever/bin/rr +79 -0
- package/rubyretriever/lib/retriever/cli.rb +25 -0
- package/rubyretriever/lib/retriever/core_ext.rb +13 -0
- package/rubyretriever/lib/retriever/fetch.rb +268 -0
- package/rubyretriever/lib/retriever/fetchfiles.rb +71 -0
- package/rubyretriever/lib/retriever/fetchseo.rb +18 -0
- package/rubyretriever/lib/retriever/fetchsitemap.rb +43 -0
- package/rubyretriever/lib/retriever/link.rb +47 -0
- package/rubyretriever/lib/retriever/openuri_redirect_patch.rb +8 -0
- package/rubyretriever/lib/retriever/page.rb +104 -0
- package/rubyretriever/lib/retriever/page_iterator.rb +21 -0
- package/rubyretriever/lib/retriever/target.rb +47 -0
- package/rubyretriever/lib/retriever/version.rb +4 -0
- package/rubyretriever/lib/retriever.rb +15 -0
- package/rubyretriever/readme.md +166 -0
- package/rubyretriever/rubyretriever.gemspec +41 -0
- package/rubyretriever/spec/link_spec.rb +77 -0
- package/rubyretriever/spec/page_spec.rb +94 -0
- package/rubyretriever/spec/retriever_spec.rb +84 -0
- package/rubyretriever/spec/spec_helper.rb +17 -0
- package/rubyretriever/spec/target_spec.rb +55 -0
- package/snapcrawl/.changelog.old.md +157 -0
- package/snapcrawl/.gitattributes +1 -0
- package/snapcrawl/.github/workflows/test.yml +41 -0
- package/snapcrawl/.rspec +3 -0
- package/snapcrawl/.rubocop.yml +23 -0
- package/snapcrawl/CHANGELOG.md +182 -0
- package/snapcrawl/Gemfile +15 -0
- package/snapcrawl/LICENSE +21 -0
- package/snapcrawl/README.md +135 -0
- package/snapcrawl/Runfile +35 -0
- package/snapcrawl/bin/snapcrawl +25 -0
- package/snapcrawl/lib/snapcrawl/cli.rb +52 -0
- package/snapcrawl/lib/snapcrawl/config.rb +60 -0
- package/snapcrawl/lib/snapcrawl/crawler.rb +98 -0
- package/snapcrawl/lib/snapcrawl/dependencies.rb +21 -0
- package/snapcrawl/lib/snapcrawl/exceptions.rb +5 -0
- package/snapcrawl/lib/snapcrawl/log_helpers.rb +36 -0
- package/snapcrawl/lib/snapcrawl/page.rb +118 -0
- package/snapcrawl/lib/snapcrawl/pretty_logger.rb +11 -0
- package/snapcrawl/lib/snapcrawl/refinements/pair_split.rb +26 -0
- package/snapcrawl/lib/snapcrawl/refinements/string_refinements.rb +13 -0
- package/snapcrawl/lib/snapcrawl/screenshot.rb +73 -0
- package/snapcrawl/lib/snapcrawl/templates/config.yml +49 -0
- package/snapcrawl/lib/snapcrawl/templates/docopt.txt +26 -0
- package/snapcrawl/lib/snapcrawl/version.rb +3 -0
- package/snapcrawl/lib/snapcrawl.rb +20 -0
- package/snapcrawl/snapcrawl.gemspec +27 -0
- package/snapcrawl/snapcrawl.yml +41 -0
- package/snapcrawl/spec/README.md +16 -0
- package/snapcrawl/spec/approvals/bin/help +26 -0
- package/snapcrawl/spec/approvals/bin/usage +4 -0
- package/snapcrawl/spec/approvals/cli/usage +4 -0
- package/snapcrawl/spec/approvals/config/defaults +15 -0
- package/snapcrawl/spec/approvals/config/minimal +15 -0
- package/snapcrawl/spec/approvals/integration/blacklist +14 -0
- package/snapcrawl/spec/approvals/integration/default-config +14 -0
- package/snapcrawl/spec/approvals/integration/depth-0 +6 -0
- package/snapcrawl/spec/approvals/integration/depth-3 +6 -0
- package/snapcrawl/spec/approvals/integration/log-color-no +6 -0
- package/snapcrawl/spec/approvals/integration/screenshot-error +3 -0
- package/snapcrawl/spec/approvals/integration/whitelist +14 -0
- package/snapcrawl/spec/approvals/models/pretty_logger/colors +1 -0
- package/snapcrawl/spec/fixtures/config/minimal.yml +4 -0
- package/snapcrawl/spec/server/config.ru +97 -0
- package/snapcrawl/spec/snapcrawl/bin_spec.rb +15 -0
- package/snapcrawl/spec/snapcrawl/cli_spec.rb +9 -0
- package/snapcrawl/spec/snapcrawl/config_spec.rb +26 -0
- package/snapcrawl/spec/snapcrawl/integration_spec.rb +65 -0
- package/snapcrawl/spec/snapcrawl/page_spec.rb +89 -0
- package/snapcrawl/spec/snapcrawl/pretty_logger_spec.rb +19 -0
- package/snapcrawl/spec/snapcrawl/refinements/pair_split_spec.rb +27 -0
- package/snapcrawl/spec/snapcrawl/refinements/string_refinements_spec.rb +29 -0
- package/snapcrawl/spec/snapcrawl/screenshot_spec.rb +62 -0
- package/snapcrawl/spec/spec_helper.rb +22 -0
- package/snapcrawl/spec/spec_mixin.rb +10 -0
package/crawler/test/lib/crawler_test.exs ADDED
@@ -0,0 +1,295 @@
+defmodule CrawlerTest do
+  use Crawler.TestCase, async: false
+
+  alias Crawler.Store
+
+  @moduletag capture_log: true
+
+  doctest Crawler
+
+  test ".crawl", %{bypass: bypass, url: url} do
+    Store.ops_reset()
+
+    url = "#{url}/crawler"
+    linked_url1 = "#{url}/link1"
+    linked_url2 = "#{url}/link2"
+    linked_url3 = "#{url}/link3"
+    linked_url4 = "#{url}/link4"
+
+    Bypass.expect_once(bypass, "GET", "/crawler", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url1}">1</a></html>
+      <html><a href="#{linked_url2}">2</a></html>
+      """)
+    end)
+
+    Bypass.expect_once(bypass, "GET", "/crawler/link1", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url2}">2</a></html>
+      """)
+    end)
+
+    Bypass.expect_once(bypass, "GET", "/crawler/link2", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url3}">3</a></html>
+      """)
+    end)
+
+    Bypass.expect_once(bypass, "GET", "/crawler/link3", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url4}">4</a></html>
+      """)
+    end)
+
+    {:ok, opts} = Crawler.crawl(url, max_depths: 3, workers: 3, interval: 100, store: Store)
+
+    assert Crawler.running?(opts)
+
+    Crawler.pause(opts)
+
+    refute Crawler.running?(opts)
+
+    assert opts[:workers] == 3
+
+    Crawler.resume(opts)
+
+    assert Crawler.running?(opts)
+
+    wait(fn ->
+      assert Store.ops_count() == 4
+    end)
+
+    wait(fn ->
+      assert %Store.Page{url: ^url, opts: %{workers: 3}} = Store.find_processed({url, nil})
+
+      assert Store.find_processed({linked_url1, nil})
+      assert Store.find_processed({linked_url2, nil})
+      assert Store.find_processed({linked_url3, nil})
+      refute Store.find({linked_url4, nil})
+
+      urls = Crawler.Store.all_urls()
+
+      assert Enum.member?(urls, {url, nil})
+      assert Enum.member?(urls, {linked_url1, nil})
+      assert Enum.member?(urls, {linked_url2, nil})
+      assert Enum.member?(urls, {linked_url3, nil})
+      refute Enum.member?(urls, {linked_url4, nil})
+    end)
+
+    wait(fn ->
+      refute Crawler.running?(opts)
+      assert OPQ.info(opts[:queue]) == {:normal, %OPQ.Queue{data: {[], []}}, 3}
+    end)
+  end
+
+  test ".crawl without a store", %{bypass: bypass, url: url} do
+    url = "#{url}/crawler_without_store"
+
+    Bypass.expect_once(bypass, "GET", "/crawler_without_store", fn conn ->
+      Plug.Conn.resp(conn, 200, "200")
+    end)
+
+    {:ok, opts} = Crawler.crawl(url, max_depths: 1, workers: 1, interval: 100, store: nil)
+
+    wait(fn ->
+      assert %Store.Page{url: ^url, body: nil, opts: nil} = Store.find_processed({url, nil})
+    end)
+
+    wait(fn ->
+      assert OPQ.info(opts[:queue]) == {:normal, %OPQ.Queue{data: {[], []}}, 1}
+    end)
+  end
+
+  test ".crawl with max_pages", %{bypass: bypass, url: url} do
+    Store.ops_reset()
+
+    url = "#{url}/crawler_with_max_pages"
+    linked_url1 = "#{url}/link1"
+    linked_url2 = "#{url}/link2"
+    linked_url3 = "#{url}/link3"
+    linked_url4 = "#{url}/link4"
+    linked_url5 = "#{url}/link5"
+
+    Bypass.expect_once(bypass, "GET", "/crawler_with_max_pages", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url1}">1</a></html>
+      <html><a href="#{linked_url2}">2</a></html>
+      <html><a href="#{linked_url3}">3</a></html>
+      <html><a href="#{linked_url4}">4</a></html>
+      <html><a href="#{linked_url5}">5</a></html>
+      """)
+    end)
+
+    Bypass.expect_once(bypass, "GET", "/crawler_with_max_pages/link1", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url2}">2</a></html>
+      <html><a href="#{linked_url3}">3</a></html>
+      """)
+    end)
+
+    Bypass.expect_once(bypass, "GET", "/crawler_with_max_pages/link2", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url3}">3</a></html>
+      <html><a href="#{linked_url4}">4</a></html>
+      <html><a href="#{linked_url5}">5</a></html>
+      """)
+    end)
+
+    Bypass.stub(bypass, "GET", "/crawler_with_max_pages/link3", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url3}">3</a></html>
+      <html><a href="#{linked_url4}">4</a></html>
+      <html><a href="#{linked_url5}">5</a></html>
+      """)
+    end)
+
+    Bypass.stub(bypass, "GET", "/crawler_with_max_pages/link4", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url3}">3</a></html>
+      <html><a href="#{linked_url4}">4</a></html>
+      <html><a href="#{linked_url5}">5</a></html>
+      """)
+    end)
+
+    {:ok, opts} = Crawler.crawl(url, max_depths: 3, force: true, workers: 4, max_pages: 3, interval: 100)
+
+    wait(fn ->
+      assert Store.ops_count() == 4
+    end)
+
+    wait(fn ->
+      assert Store.find_processed({url, opts[:scope]})
+      assert Store.find_processed({linked_url1, opts[:scope]})
+      assert Store.find_processed({linked_url2, opts[:scope]})
+      assert Store.find_processed({linked_url3, opts[:scope]})
+      refute Store.find({linked_url4, opts[:scope]})
+      refute Store.find({linked_url5, opts[:scope]})
+    end)
+
+    wait(fn ->
+      assert OPQ.info(opts[:queue]) == {:normal, %OPQ.Queue{data: {[], []}}, 4}
+    end)
+  end
+
+  test ".crawl with an existing queue", %{bypass: bypass, url: url} do
+    Store.ops_reset()
+
+    url = "#{url}/crawler_with_queue"
+    linked_url1 = "#{url}/link1"
+    linked_url2 = "#{url}/link2"
+    linked_url3 = "#{url}/link3"
+    linked_url4 = "#{url}/link4"
+
+    Bypass.expect_once(bypass, "GET", "/crawler_with_queue/link1", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url2}">2</a></html>
+      """)
+    end)
+
+    Bypass.expect_once(bypass, "GET", "/crawler_with_queue/link2", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url3}">3</a></html>
+      """)
+    end)
+
+    Bypass.expect_once(bypass, "GET", "/crawler_with_queue/link3", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html>ok</html>
+      """)
+    end)
+
+    {:ok, queue} = OPQ.init(worker: Crawler.Dispatcher.Worker, workers: 2, interval: 100)
+
+    {:ok, opts1} = Crawler.crawl(linked_url1, store: Store, queue: queue)
+    {:ok, opts2} = Crawler.crawl(linked_url2, store: Store, queue: queue)
+
+    wait(fn ->
+      assert Store.ops_count() == 3
+    end)
+
+    wait(fn ->
+      assert Store.find_processed({linked_url1, nil})
+      assert Store.find_processed({linked_url2, nil})
+      assert Store.find_processed({linked_url3, nil})
+      refute Store.find_processed({linked_url4, nil})
+
+      urls = Crawler.Store.all_urls()
+
+      assert Enum.member?(urls, {linked_url1, nil})
+      assert Enum.member?(urls, {linked_url2, nil})
+      assert Enum.member?(urls, {linked_url3, nil})
+      refute Enum.member?(urls, {linked_url4, nil})
+    end)
+
+    wait(fn ->
+      assert OPQ.info(opts1[:queue]) == {:normal, %OPQ.Queue{data: {[], []}}, 2}
+      assert OPQ.info(opts2[:queue]) == {:normal, %OPQ.Queue{data: {[], []}}, 2}
+    end)
+  end
+
+  test ".crawl forced", %{bypass: bypass, url: url} do
+    Store.ops_reset()
+
+    url = "#{url}/crawler_forced"
+    linked_url1 = "#{url}/link1"
+    linked_url2 = "#{url}/link2"
+
+    Bypass.expect(bypass, "GET", "/crawler_forced", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url1}">1</a></html>
+      <html><a href="#{linked_url1}">1</a></html>
+      """)
+    end)
+
+    Bypass.expect(bypass, "GET", "/crawler_forced/link1", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url2}">2</a></html>
+      """)
+    end)
+
+    Bypass.expect(bypass, "GET", "/crawler_forced/link2", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html>ok</html>
+      """)
+    end)
+
+    {:ok, opts1} = Crawler.crawl(url, force: true, workers: 1, interval: 100)
+    {:ok, opts2} = Crawler.crawl(url, force: true, workers: 2, interval: 100)
+
+    refute opts1[:scope] == opts2[:scope]
+
+    wait(fn ->
+      assert Store.find_processed({url, opts1[:scope]})
+      assert Store.find_processed({url, opts2[:scope]})
+      assert Store.find_processed({linked_url1, opts1[:scope]})
+      assert Store.find_processed({linked_url1, opts2[:scope]})
+      assert Store.find_processed({linked_url2, opts1[:scope]})
+      assert Store.find_processed({linked_url2, opts2[:scope]})
+
+      assert Store.ops_count() >= 6
+      assert Store.ops_count() <= 10
+
+      assert OPQ.info(opts1[:queue]) == {:normal, %OPQ.Queue{data: {[], []}}, 1}
+      assert OPQ.info(opts2[:queue]) == {:normal, %OPQ.Queue{data: {[], []}}, 2}
+    end)
+  end
+
+  test ".crawl stopped", %{bypass: bypass, url: url} do
+    url = "#{url}/stop"
+    linked_url = "#{url}/stop1"
+
+    Bypass.expect_once(bypass, "GET", "/stop", fn conn ->
+      Plug.Conn.resp(conn, 200, """
+      <html><a href="#{linked_url}">1</a></html>
+      """)
+    end)
+
+    {:ok, opts} = Crawler.crawl(url, workers: 1, interval: 500)
+
+    Process.sleep(200)
+
+    Crawler.stop(opts)
+
+    refute Store.find({linked_url, nil})
+  end
+end
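For quick orientation, here is a minimal usage sketch assembled only from the calls and options exercised in the test file above (`Crawler.crawl/2`, `Crawler.pause/1`, `Crawler.resume/1`, `Crawler.stop/1`, `Crawler.running?/1`, and the `Crawler.Store` lookups); the URL and option values are illustrative, and the package's own README remains the authoritative reference.

```elixir
# Sketch based solely on the API exercised in crawler_test.exs above; values are illustrative.
{:ok, opts} = Crawler.crawl("http://example.com", max_depths: 2, workers: 2, interval: 100)

Crawler.running?(opts) # => true, the crawl runs asynchronously
Crawler.pause(opts)    # stop dispatching queued pages
Crawler.resume(opts)   # continue draining the existing queue

# Processed pages end up in the store, keyed by {url, scope}.
Crawler.Store.all_urls()
Crawler.Store.find_processed({"http://example.com", nil})

Crawler.stop(opts)
```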
package/crawler/test/support/test_case.ex ADDED
@@ -0,0 +1,24 @@
+defmodule Crawler.TestCase do
+  use ExUnit.CaseTemplate
+
+  using do
+    quote do
+      import Crawler.TestHelpers
+    end
+  end
+
+  setup_all do
+    bypass = Bypass.open()
+    url = "http://localhost:#{bypass.port}"
+    path = "localhost-#{bypass.port}"
+
+    bypass2 = Bypass.open()
+    url2 = "http://localhost:#{bypass2.port}"
+    path2 = "localhost-#{bypass2.port}"
+
+    {
+      :ok,
+      bypass: bypass, url: url, path: path, bypass2: bypass2, url2: url2, path2: path2
+    }
+  end
+end
package/crawler/test/support/test_helpers.ex ADDED
@@ -0,0 +1,28 @@
+# Credit: https://gist.github.com/cblavier/5e15791387a6e22b98d8
+defmodule Crawler.TestHelpers do
+  def wait(fun), do: wait(500, fun)
+  def wait(0, fun), do: fun.()
+
+  def wait(timeout, fun) do
+    try do
+      fun.()
+    rescue
+      _ ->
+        :timer.sleep(10)
+        wait(max(0, timeout - 10), fun)
+    end
+  end
+
+  def tmp(path \\ "", filename \\ "") do
+    tmp_path = Path.join([File.cwd!(), "test", "tmp", path])
+
+    File.mkdir_p(tmp_path)
+
+    Path.join(tmp_path, filename)
+  end
+
+  def image_file do
+    {:ok, file} = File.read("test/fixtures/introducing-elixir.jpg")
+    file
+  end
+end
package/grell/.rspec ADDED

package/grell/.travis.yml ADDED
@@ -0,0 +1,28 @@
+language: ruby
+cache: bundler
+
+rvm:
+  - 2.2.4
+  - 2.3.0
+  - 2.4.2
+
+before_install:
+  - mkdir travis-phantomjs
+  - wget https://github.com/JordiPolo/phantomjs/blob/master/phantomjs-2.1.1-linux-x86_64.tar.bz2?raw=true
+    -O $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2
+  - tar -xvf $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2 -C $PWD/travis-phantomjs
+  - export PATH=$PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64/bin:$PATH
+
+install:
+  - bundle install --jobs=3 --retry=3
+
+script:
+  - bundle exec rspec
+
+deploy:
+  provider: rubygems
+  api_key:
+    secure: czStDI0W6MWL70sDwu53oNNCc8vKtT61pgvii+ZWIC9A41C2p7BzmbtosXsnLk2ApxmpWvFIgtQE0XIH7jkM5mY05cHinXDphtOTkNLFVjck3ZOMkx/cc+QRFW8K4FHkrzFsC+/Xx4t2/Psh35LpzhfJd0XzKKoCstXUVgJsfGcAK3DMpjXHSUbwLXGDZ4lzmsk52OLf0oL+in2447TJfVOvGXtYmfh1PjXRwDxKB0dan7w5mVgajS52b6wUhVPTaMe/JgCbMuV7BaQ1Goq8u7V4aaxU+liPAhzHWfMB6tF4TEW8yu2tvGLdOA0+1jmM8E9Q5saPWtwKiHvBxN8CzRpkiNDzyFAf8ljrWT5yKX3aRQCyPp3NNyhoumWap36b+O/zwZ3HxoAe22Yg0rjz8z8NxMR/ELPvjPYjCiF5zY7fO9PAzmIynMRUrxDnFj+/JGHdzx0ZMo3fEXgHHSaHPNxIzEffVVQk4XLVnFHDjBLY4mVp4sbHbja5qnui20RkdM/H9Yi/fQyl1ODhk+LUPoh45ZneDZq7GPrl+WKK06oEjXIXLU+1iEuqnSqybbmJMTUJlUV+7EJdtq2DgfDB4KXwLm2LLOR/IX63AzEav4NIxx3hIXifSKa9rp6D7nMTzdQwF0FFzIj/Y3qLrAe1WWt0gx3Vxq67pSwOJthk5Fc=
+  on:
+    tags: true
+    rvm: 2.4.2
package/grell/CHANGELOG.md ADDED
@@ -0,0 +1,111 @@
+# 2.1.2
+* Change white/black lists to allow/deny lists
+
+# 2.1.1
+* Update phantomjs_options to use 'TLSv1.2'
+
+# 2.1.0
+* Delete `driver_options` configuration key as it was never used.
+* `cleanup_all_processes` is a self method as intended to.
+
+# 2.0.0
+* New configuration key `on_periodic_restart`.
+* CrawlerManager.cleanup_all_processes method destroy all instances of phantomjs in this machine.
+
+* Breaking changes
+  - Requires Ruby 2.1 or later.
+  - Crawler.start_crawling does not accept options anymore, all options are passed to Crawler.new.
+  - Crawler's methods `restart` and `quit` have been moved to CrawlerManager.
+  - Crawler gets whitelist and blacklist as configuration options instead of being set in specific methods.
+
+# 1.6.11
+* Ensure all links are loaded by waiting for Ajax requests to complete
+* Add '@evaluate_in_each_page' option to evaluate before extracting links (e.g. $('.dropdown').addClass('open');)
+
+# 1.6.10
+* Avoid following JS href links, add missing dependencies to fix Travis build
+
+# 1.6.9
+* Avoid following links when disabled by CSS (1.6.8 worked only for Javascript)
+
+# 1.6.8
+* Avoid following disabled links
+
+# 1.6.7
+* Increment '@times_visited' first to avoid infinite retries when rescuing errors
+
+# 1.6.6
+* Updated phantomjs_logger not to open '/dev/null'
+
+# 1.6.5
+* Added #quit to Crawler
+
+# 1.6.4
+* Added #quit to Capybara driver
+
+# 1.6.3
+* Only follow visible links
+
+# 1.6.2
+* Reset Capybara driver to Puffing Billy (used to rewrite URL requests in specs)
+* Use float timestamp for Poltergeist driver name to support fast test executions
+
+# 1.6.1
+* Use non-static name to support registering Poltergeist crawler multiple times
+* More exception handling, store redirected URLs in addition to original URL
+
+# 1.6
+* Support custom URL comparison when adding new pages during crawling
+* Don't rescue Timeout error, so that Delayed Job can properly terminate hanging jobs
+* Fail early if Capybara doesn't initialize properly
+
+# 1.5.1
+* Fixed deprecation warning (Thanks scott)
+* Updated Poltergeist dependency
+
+# 1.5.0
+* Grell will follow redirects.
+* Added #followed_redirects? #error? #current_url methods to the Page class
+
+# 1.4.0
+* Added crawler.restart to restart browser process
+* The block of code can make grell retry any given page.
+
+# 1.3.2
+* Rescue Timeout error and return an empty page when that happens
+
+# 1.3.1
+* Added whitelisting and blacklisting
+* Better info in gemspec
+
+# 1.3
+* The Crawler object allows you to provide an external logger object.
+* Clearer semantics when an error happens, special headers are returned so the user can inspect the error
+* Caveats:
+  - The 'debug' option in the crawler does not have any affect anymore. Provide an external logger with 'logger' instead
+  - The errors provided in the headers by grell has changed from 'grell_status' to 'grellStatus'.
+  - The 'visited' property in the page was never supposed to be accesible. Use 'visited?' instead.
+
+# 1.2.1
+* Solve bug: URLs are case insensitive
+
+# 1.2
+* Grell now will consider two links to point to the same page only when the whole URL is exactly the same.
+  Versions previously would only consider two links to be the same when they shared the path.
+
+# 1.1.2
+* Solve bug where we were adding links in heads as if there were normal links in the body
+
+# 1.1.1
+* Solve bug with the new data-href functionality
+
+# 1.1
+* Solve problem with randomly failing spec
+* Search for elements with 'href' or 'data-href' to find links
+
+# 1.0.1
+* Rescueing Javascript errors
+
+# 1.0
+* Initial implementation
+* Basic support to crawling pages.
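Read together, these changelog entries outline grell's post-2.0 shape: all options are passed to `Crawler.new`, white/black lists become allow/deny lists, and process management moves to `CrawlerManager`. The Ruby fragment below is a rough sketch assembled only from option and method names mentioned above (`logger`, allow/deny lists, `evaluate_in_each_page`, `start_crawling`, `#error?`); the exact keyword names and signatures are assumptions and should be checked against the gem's README before use.

```ruby
require 'logger'
require 'grell'

# Illustrative sketch only: option keys are inferred from the changelog entries above,
# not copied from the gem's documentation.
crawler = Grell::Crawler.new(
  logger: Logger.new($stdout),                               # external logger (1.3)
  allowlist: [%r{/blog/}],                                    # allow/deny lists (2.1.2)
  denylist: [%r{/admin/}],
  evaluate_in_each_page: "$('.dropdown').addClass('open');"  # reveal JS-hidden links (1.6.11)
)

# start_crawling no longer takes options (2.0.0); the block sees each crawled page.
crawler.start_crawling('https://example.com') do |page|
  puts "#{page.url} error=#{page.error?}"
end
```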
package/grell/Gemfile ADDED

package/grell/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+Copyright (c) 2015 Medidata Solutions Worldwide
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.