powerdlz23 1.2.3 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. package/Spider/README.md +19 -0
  2. package/Spider/domain.py +18 -0
  3. package/Spider/general.py +51 -0
  4. package/Spider/link_finder.py +25 -0
  5. package/Spider/main.py +50 -0
  6. package/Spider/spider.py +74 -0
  7. package/crawler/.formatter.exs +5 -0
  8. package/crawler/.github/workflows/ci.yml +29 -0
  9. package/crawler/.recode.exs +33 -0
  10. package/crawler/.tool-versions +2 -0
  11. package/crawler/CHANGELOG.md +82 -0
  12. package/crawler/README.md +198 -0
  13. package/crawler/architecture.svg +4 -0
  14. package/crawler/config/config.exs +9 -0
  15. package/crawler/config/dev.exs +5 -0
  16. package/crawler/config/test.exs +5 -0
  17. package/crawler/examples/google_search/scraper.ex +37 -0
  18. package/crawler/examples/google_search/url_filter.ex +11 -0
  19. package/crawler/examples/google_search.ex +77 -0
  20. package/crawler/lib/crawler/dispatcher/worker.ex +14 -0
  21. package/crawler/lib/crawler/dispatcher.ex +20 -0
  22. package/crawler/lib/crawler/fetcher/header_preparer.ex +60 -0
  23. package/crawler/lib/crawler/fetcher/modifier.ex +45 -0
  24. package/crawler/lib/crawler/fetcher/policer.ex +77 -0
  25. package/crawler/lib/crawler/fetcher/recorder.ex +55 -0
  26. package/crawler/lib/crawler/fetcher/requester.ex +32 -0
  27. package/crawler/lib/crawler/fetcher/retrier.ex +43 -0
  28. package/crawler/lib/crawler/fetcher/url_filter.ex +26 -0
  29. package/crawler/lib/crawler/fetcher.ex +81 -0
  30. package/crawler/lib/crawler/http.ex +7 -0
  31. package/crawler/lib/crawler/linker/path_builder.ex +71 -0
  32. package/crawler/lib/crawler/linker/path_expander.ex +59 -0
  33. package/crawler/lib/crawler/linker/path_finder.ex +106 -0
  34. package/crawler/lib/crawler/linker/path_offliner.ex +59 -0
  35. package/crawler/lib/crawler/linker/path_prefixer.ex +46 -0
  36. package/crawler/lib/crawler/linker.ex +173 -0
  37. package/crawler/lib/crawler/options.ex +127 -0
  38. package/crawler/lib/crawler/parser/css_parser.ex +37 -0
  39. package/crawler/lib/crawler/parser/guarder.ex +38 -0
  40. package/crawler/lib/crawler/parser/html_parser.ex +41 -0
  41. package/crawler/lib/crawler/parser/link_parser/link_expander.ex +32 -0
  42. package/crawler/lib/crawler/parser/link_parser.ex +50 -0
  43. package/crawler/lib/crawler/parser.ex +122 -0
  44. package/crawler/lib/crawler/queue_handler.ex +45 -0
  45. package/crawler/lib/crawler/scraper.ex +28 -0
  46. package/crawler/lib/crawler/snapper/dir_maker.ex +45 -0
  47. package/crawler/lib/crawler/snapper/link_replacer.ex +95 -0
  48. package/crawler/lib/crawler/snapper.ex +82 -0
  49. package/crawler/lib/crawler/store/counter.ex +19 -0
  50. package/crawler/lib/crawler/store/page.ex +7 -0
  51. package/crawler/lib/crawler/store.ex +87 -0
  52. package/crawler/lib/crawler/worker.ex +62 -0
  53. package/crawler/lib/crawler.ex +91 -0
  54. package/crawler/mix.exs +78 -0
  55. package/crawler/mix.lock +40 -0
  56. package/crawler/test/fixtures/introducing-elixir.jpg +0 -0
  57. package/crawler/test/integration_test.exs +135 -0
  58. package/crawler/test/lib/crawler/dispatcher/worker_test.exs +7 -0
  59. package/crawler/test/lib/crawler/dispatcher_test.exs +5 -0
  60. package/crawler/test/lib/crawler/fetcher/header_preparer_test.exs +7 -0
  61. package/crawler/test/lib/crawler/fetcher/policer_test.exs +71 -0
  62. package/crawler/test/lib/crawler/fetcher/recorder_test.exs +9 -0
  63. package/crawler/test/lib/crawler/fetcher/requester_test.exs +9 -0
  64. package/crawler/test/lib/crawler/fetcher/retrier_test.exs +7 -0
  65. package/crawler/test/lib/crawler/fetcher/url_filter_test.exs +7 -0
  66. package/crawler/test/lib/crawler/fetcher_test.exs +153 -0
  67. package/crawler/test/lib/crawler/http_test.exs +47 -0
  68. package/crawler/test/lib/crawler/linker/path_builder_test.exs +7 -0
  69. package/crawler/test/lib/crawler/linker/path_expander_test.exs +7 -0
  70. package/crawler/test/lib/crawler/linker/path_finder_test.exs +7 -0
  71. package/crawler/test/lib/crawler/linker/path_offliner_test.exs +7 -0
  72. package/crawler/test/lib/crawler/linker/path_prefixer_test.exs +7 -0
  73. package/crawler/test/lib/crawler/linker_test.exs +7 -0
  74. package/crawler/test/lib/crawler/options_test.exs +7 -0
  75. package/crawler/test/lib/crawler/parser/css_parser_test.exs +7 -0
  76. package/crawler/test/lib/crawler/parser/guarder_test.exs +7 -0
  77. package/crawler/test/lib/crawler/parser/html_parser_test.exs +7 -0
  78. package/crawler/test/lib/crawler/parser/link_parser/link_expander_test.exs +7 -0
  79. package/crawler/test/lib/crawler/parser/link_parser_test.exs +7 -0
  80. package/crawler/test/lib/crawler/parser_test.exs +8 -0
  81. package/crawler/test/lib/crawler/queue_handler_test.exs +7 -0
  82. package/crawler/test/lib/crawler/scraper_test.exs +7 -0
  83. package/crawler/test/lib/crawler/snapper/dir_maker_test.exs +7 -0
  84. package/crawler/test/lib/crawler/snapper/link_replacer_test.exs +7 -0
  85. package/crawler/test/lib/crawler/snapper_test.exs +9 -0
  86. package/crawler/test/lib/crawler/worker_test.exs +5 -0
  87. package/crawler/test/lib/crawler_test.exs +295 -0
  88. package/crawler/test/support/test_case.ex +24 -0
  89. package/crawler/test/support/test_helpers.ex +28 -0
  90. package/crawler/test/test_helper.exs +7 -0
  91. package/grell/.rspec +2 -0
  92. package/grell/.travis.yml +28 -0
  93. package/grell/CHANGELOG.md +111 -0
  94. package/grell/Gemfile +7 -0
  95. package/grell/LICENSE.txt +22 -0
  96. package/grell/README.md +213 -0
  97. package/grell/Rakefile +2 -0
  98. package/grell/grell.gemspec +36 -0
  99. package/grell/lib/grell/capybara_driver.rb +44 -0
  100. package/grell/lib/grell/crawler.rb +83 -0
  101. package/grell/lib/grell/crawler_manager.rb +84 -0
  102. package/grell/lib/grell/grell_logger.rb +10 -0
  103. package/grell/lib/grell/page.rb +275 -0
  104. package/grell/lib/grell/page_collection.rb +62 -0
  105. package/grell/lib/grell/rawpage.rb +62 -0
  106. package/grell/lib/grell/reader.rb +18 -0
  107. package/grell/lib/grell/version.rb +3 -0
  108. package/grell/lib/grell.rb +11 -0
  109. package/grell/spec/lib/capybara_driver_spec.rb +38 -0
  110. package/grell/spec/lib/crawler_manager_spec.rb +174 -0
  111. package/grell/spec/lib/crawler_spec.rb +361 -0
  112. package/grell/spec/lib/page_collection_spec.rb +159 -0
  113. package/grell/spec/lib/page_spec.rb +418 -0
  114. package/grell/spec/lib/reader_spec.rb +43 -0
  115. package/grell/spec/spec_helper.rb +66 -0
  116. package/heartmagic/config.py +1 -0
  117. package/heartmagic/heart.py +3 -0
  118. package/heartmagic/pytransform/__init__.py +483 -0
  119. package/heartmagic/pytransform/_pytransform.dll +0 -0
  120. package/heartmagic/pytransform/_pytransform.so +0 -0
  121. package/httpStatusCode/README.md +2 -0
  122. package/httpStatusCode/httpStatusCode.js +4 -0
  123. package/httpStatusCode/reasonPhrases.js +344 -0
  124. package/httpStatusCode/statusCodes.js +344 -0
  125. package/package.json +1 -1
  126. package/rubyretriever/.rspec +2 -0
  127. package/rubyretriever/.travis.yml +7 -0
  128. package/rubyretriever/Gemfile +3 -0
  129. package/rubyretriever/Gemfile.lock +64 -0
  130. package/rubyretriever/LICENSE +20 -0
  131. package/rubyretriever/Rakefile +7 -0
  132. package/rubyretriever/bin/rr +79 -0
  133. package/rubyretriever/lib/retriever/cli.rb +25 -0
  134. package/rubyretriever/lib/retriever/core_ext.rb +13 -0
  135. package/rubyretriever/lib/retriever/fetch.rb +268 -0
  136. package/rubyretriever/lib/retriever/fetchfiles.rb +71 -0
  137. package/rubyretriever/lib/retriever/fetchseo.rb +18 -0
  138. package/rubyretriever/lib/retriever/fetchsitemap.rb +43 -0
  139. package/rubyretriever/lib/retriever/link.rb +47 -0
  140. package/rubyretriever/lib/retriever/openuri_redirect_patch.rb +8 -0
  141. package/rubyretriever/lib/retriever/page.rb +104 -0
  142. package/rubyretriever/lib/retriever/page_iterator.rb +21 -0
  143. package/rubyretriever/lib/retriever/target.rb +47 -0
  144. package/rubyretriever/lib/retriever/version.rb +4 -0
  145. package/rubyretriever/lib/retriever.rb +15 -0
  146. package/rubyretriever/readme.md +166 -0
  147. package/rubyretriever/rubyretriever.gemspec +41 -0
  148. package/rubyretriever/spec/link_spec.rb +77 -0
  149. package/rubyretriever/spec/page_spec.rb +94 -0
  150. package/rubyretriever/spec/retriever_spec.rb +84 -0
  151. package/rubyretriever/spec/spec_helper.rb +17 -0
  152. package/rubyretriever/spec/target_spec.rb +55 -0
  153. package/snapcrawl/.changelog.old.md +157 -0
  154. package/snapcrawl/.gitattributes +1 -0
  155. package/snapcrawl/.github/workflows/test.yml +41 -0
  156. package/snapcrawl/.rspec +3 -0
  157. package/snapcrawl/.rubocop.yml +23 -0
  158. package/snapcrawl/CHANGELOG.md +182 -0
  159. package/snapcrawl/Gemfile +15 -0
  160. package/snapcrawl/LICENSE +21 -0
  161. package/snapcrawl/README.md +135 -0
  162. package/snapcrawl/Runfile +35 -0
  163. package/snapcrawl/bin/snapcrawl +25 -0
  164. package/snapcrawl/lib/snapcrawl/cli.rb +52 -0
  165. package/snapcrawl/lib/snapcrawl/config.rb +60 -0
  166. package/snapcrawl/lib/snapcrawl/crawler.rb +98 -0
  167. package/snapcrawl/lib/snapcrawl/dependencies.rb +21 -0
  168. package/snapcrawl/lib/snapcrawl/exceptions.rb +5 -0
  169. package/snapcrawl/lib/snapcrawl/log_helpers.rb +36 -0
  170. package/snapcrawl/lib/snapcrawl/page.rb +118 -0
  171. package/snapcrawl/lib/snapcrawl/pretty_logger.rb +11 -0
  172. package/snapcrawl/lib/snapcrawl/refinements/pair_split.rb +26 -0
  173. package/snapcrawl/lib/snapcrawl/refinements/string_refinements.rb +13 -0
  174. package/snapcrawl/lib/snapcrawl/screenshot.rb +73 -0
  175. package/snapcrawl/lib/snapcrawl/templates/config.yml +49 -0
  176. package/snapcrawl/lib/snapcrawl/templates/docopt.txt +26 -0
  177. package/snapcrawl/lib/snapcrawl/version.rb +3 -0
  178. package/snapcrawl/lib/snapcrawl.rb +20 -0
  179. package/snapcrawl/snapcrawl.gemspec +27 -0
  180. package/snapcrawl/snapcrawl.yml +41 -0
  181. package/snapcrawl/spec/README.md +16 -0
  182. package/snapcrawl/spec/approvals/bin/help +26 -0
  183. package/snapcrawl/spec/approvals/bin/usage +4 -0
  184. package/snapcrawl/spec/approvals/cli/usage +4 -0
  185. package/snapcrawl/spec/approvals/config/defaults +15 -0
  186. package/snapcrawl/spec/approvals/config/minimal +15 -0
  187. package/snapcrawl/spec/approvals/integration/blacklist +14 -0
  188. package/snapcrawl/spec/approvals/integration/default-config +14 -0
  189. package/snapcrawl/spec/approvals/integration/depth-0 +6 -0
  190. package/snapcrawl/spec/approvals/integration/depth-3 +6 -0
  191. package/snapcrawl/spec/approvals/integration/log-color-no +6 -0
  192. package/snapcrawl/spec/approvals/integration/screenshot-error +3 -0
  193. package/snapcrawl/spec/approvals/integration/whitelist +14 -0
  194. package/snapcrawl/spec/approvals/models/pretty_logger/colors +1 -0
  195. package/snapcrawl/spec/fixtures/config/minimal.yml +4 -0
  196. package/snapcrawl/spec/server/config.ru +97 -0
  197. package/snapcrawl/spec/snapcrawl/bin_spec.rb +15 -0
  198. package/snapcrawl/spec/snapcrawl/cli_spec.rb +9 -0
  199. package/snapcrawl/spec/snapcrawl/config_spec.rb +26 -0
  200. package/snapcrawl/spec/snapcrawl/integration_spec.rb +65 -0
  201. package/snapcrawl/spec/snapcrawl/page_spec.rb +89 -0
  202. package/snapcrawl/spec/snapcrawl/pretty_logger_spec.rb +19 -0
  203. package/snapcrawl/spec/snapcrawl/refinements/pair_split_spec.rb +27 -0
  204. package/snapcrawl/spec/snapcrawl/refinements/string_refinements_spec.rb +29 -0
  205. package/snapcrawl/spec/snapcrawl/screenshot_spec.rb +62 -0
  206. package/snapcrawl/spec/spec_helper.rb +22 -0
  207. package/snapcrawl/spec/spec_mixin.rb +10 -0

package/crawler/test/lib/crawler/fetcher/policer_test.exs
@@ -0,0 +1,71 @@
+defmodule Crawler.Fetcher.PolicerTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.Fetcher.Policer
+  alias Crawler.Fetcher.UrlFilter
+  alias Crawler.Store
+
+  @moduletag capture_log: true
+
+  doctest Policer
+
+  setup do
+    Store.ops_reset()
+
+    :ok
+  end
+
+  test "max_pages ok" do
+    Store.ops_inc()
+    Store.ops_inc()
+
+    assert {:ok, %{max_pages: :infinity}} = Policer.police(%{max_pages: :infinity})
+  end
+
+  test "max_pages error" do
+    Store.ops_inc()
+    Store.ops_inc()
+
+    assert {:warn, "Fetch failed check 'within_max_pages?', with opts: " <> _} =
+             Policer.police(%{max_pages: 1})
+  end
+
+  test "max_depths ok" do
+    assert {:ok, %{depth: 1, max_depths: 2}} = Policer.police(%{depth: 1, max_depths: 2})
+  end
+
+  test "max_depths error" do
+    assert {:warn, "Fetch failed check 'within_fetch_depth?', with opts: " <> _} =
+             Policer.police(%{
+               depth: 2,
+               max_depths: 2,
+               html_tag: "a"
+             })
+  end
+
+  test "uri_scheme ok" do
+    assert {:ok,
+            %{
+              html_tag: "img",
+              url: "http://policer/hi.jpg",
+              url_filter: UrlFilter
+            }} =
+             Policer.police(%{
+               html_tag: "img",
+               url: "http://policer/hi.jpg",
+               url_filter: UrlFilter
+             })
+  end
+
+  test "uri_scheme error" do
+    assert {:warn, "Fetch failed check 'acceptable_uri_scheme?', with opts: " <> _} =
+             Policer.police(%{url: "ftp://hello.world"})
+  end
+
+  test "fetched error" do
+    Crawler.Store.add({"http://policer/exist/", nil})
+
+    assert {:warn, "Fetch failed check 'not_fetched_yet?', with opts: " <> _} =
+             Policer.police(%{url: "http://policer/exist/", scope: nil})
+  end
+end
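
Taken together, these tests document the contract of Policer.police/1: it receives the crawl opts map and returns {:ok, opts} when every applicable check passes, or {:warn, "Fetch failed check '...', with opts: ..."} naming the first check that failed. A minimal caller sketch, reusing the opts from the "uri_scheme ok" test above (the case expression is illustrative, not part of the package):

    alias Crawler.Fetcher.{Policer, UrlFilter}

    # Mirrors the "uri_scheme ok" test: police/1 passes the opts through on
    # success, or warns with the name of the failed check.
    case Policer.police(%{html_tag: "img", url: "http://policer/hi.jpg", url_filter: UrlFilter}) do
      {:ok, opts} -> IO.inspect(opts, label: "fetch allowed")
      {:warn, message} -> IO.puts(message)
    end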

package/crawler/test/lib/crawler/fetcher/recorder_test.exs
@@ -0,0 +1,9 @@
+defmodule Crawler.Fetcher.RecorderTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.Fetcher.Recorder
+  alias Crawler.Store
+  alias Crawler.Store.Page
+
+  doctest Recorder
+end

package/crawler/test/lib/crawler/fetcher/requester_test.exs
@@ -0,0 +1,9 @@
+defmodule Crawler.Fetcher.RequesterTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.Fetcher.Requester
+
+  @moduletag capture_log: true
+
+  doctest Requester
+end

package/crawler/test/lib/crawler/fetcher/retrier_test.exs
@@ -0,0 +1,7 @@
+defmodule Crawler.Fetcher.RetrierTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.Fetcher.Retrier
+
+  doctest Retrier
+end

package/crawler/test/lib/crawler/fetcher/url_filter_test.exs
@@ -0,0 +1,7 @@
+defmodule Crawler.Fetcher.UrlFilterTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.Fetcher.UrlFilter
+
+  doctest UrlFilter
+end

package/crawler/test/lib/crawler/fetcher_test.exs
@@ -0,0 +1,153 @@
+defmodule Crawler.FetcherTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.Fetcher
+  alias Crawler.Fetcher.Modifier
+  alias Crawler.Fetcher.Retrier
+  alias Crawler.Fetcher.UrlFilter
+  alias Crawler.Store
+
+  @moduletag capture_log: true
+
+  doctest Fetcher
+
+  defmodule DummyRetrier do
+    @behaviour Retrier.Spec
+
+    def perform(fetch_url, _opts), do: fetch_url.()
+  end
+
+  @defaults %{
+    depth: 0,
+    retries: 2,
+    url_filter: UrlFilter,
+    modifier: Modifier,
+    retrier: DummyRetrier,
+    store: Store,
+    html_tag: "a"
+  }
+
+  test "success", %{bypass: bypass, url: url} do
+    url = "#{url}/fetcher/200"
+
+    Bypass.expect_once(bypass, "GET", "/fetcher/200", fn conn ->
+      Plug.Conn.resp(conn, 200, "<html>200</html>")
+    end)
+
+    @defaults
+    |> Map.merge(%{url: url})
+    |> Fetcher.fetch()
+
+    page = Store.find({url, nil})
+
+    assert page.url == url
+    assert page.body == "<html>200</html>"
+  end
+
+  test "success: 301", %{bypass: bypass, url: url} do
+    Bypass.expect_once(bypass, "GET", "/fetcher/301", fn conn ->
+      conn
+      |> Plug.Conn.merge_resp_headers([{"location", "#{url}/fetcher/301_200"}])
+      |> Plug.Conn.resp(301, "")
+    end)
+
+    Bypass.expect_once(bypass, "GET", "/fetcher/301_200", fn conn ->
+      Plug.Conn.resp(conn, 200, "<html>301_200</html>")
+    end)
+
+    url = "#{url}/fetcher/301"
+
+    @defaults
+    |> Map.merge(%{url: url})
+    |> Fetcher.fetch()
+
+    page = Store.find({url, nil})
+
+    assert page.url == url
+    assert page.body == "<html>301_200</html>"
+  end
+
+  test "failure: 500", %{bypass: bypass, url: url} do
+    url = "#{url}/fetcher/500"
+
+    Bypass.expect_once(bypass, "GET", "/fetcher/500", fn conn ->
+      Plug.Conn.resp(conn, 500, "<html>500</html>")
+    end)
+
+    fetcher =
+      @defaults
+      |> Map.merge(%{url: url})
+      |> Fetcher.fetch()
+
+    assert fetcher == {:warn, "Failed to fetch #{url}, status code: 500"}
+    refute Store.find({url, nil}).body
+  end
+
+  test "failure: timeout", %{bypass: bypass, url: url} do
+    url = "#{url}/fetcher/timeout"
+
+    Bypass.expect_once(bypass, "GET", "/fetcher/timeout", fn conn ->
+      Process.flag(:trap_exit, true)
+      :timer.sleep(100)
+      Plug.Conn.resp(conn, 200, "<html>200</html>")
+    end)
+
+    fetcher =
+      @defaults
+      |> Map.merge(%{url: url, timeout: 50})
+      |> Fetcher.fetch()
+
+    assert fetcher == {:warn, "Failed to fetch #{url}, reason: :timeout"}
+    refute Store.find({url, nil}).body
+  end
+
+  test "failure: retries", %{bypass: bypass, url: url} do
+    url = "#{url}/fetcher/retries"
+
+    Bypass.expect(bypass, "GET", "/fetcher/retries", fn conn ->
+      Plug.Conn.resp(conn, 500, "<html>500</html>")
+    end)
+
+    wait(fn ->
+      fetcher =
+        @defaults
+        |> Map.merge(%{url: url, timeout: 100, retrier: Retrier})
+        |> Fetcher.fetch()
+
+      assert fetcher == {:warn, "Failed to fetch #{url}, status code: 500"}
+      refute Store.find({url, nil}).body
+    end)
+  end
+
+  test "failure: unable to write", %{bypass: bypass, url: url, path: path} do
+    url = "#{url}/fetcher/fail.html"
+
+    Bypass.expect_once(bypass, "GET", "/fetcher/fail.html", fn conn ->
+      Plug.Conn.resp(conn, 200, "<html>200</html>")
+    end)
+
+    fetcher =
+      @defaults
+      |> Map.merge(%{url: url, save_to: "nope"})
+      |> Fetcher.fetch()
+
+    assert {:error, "Cannot write to file nope/#{path}/fetcher/fail.html, reason: enoent"} ==
+             fetcher
+  end
+
+  test "snap /fetcher/page.html", %{bypass: bypass, url: url, path: path} do
+    url = "#{url}/fetcher/page.html"
+
+    Bypass.expect_once(bypass, "GET", "/fetcher/page.html", fn conn ->
+      Plug.Conn.resp(conn, 200, "<html>200</html>")
+    end)
+
+    @defaults
+    |> Map.merge(%{url: url, save_to: tmp("fetcher")})
+    |> Fetcher.fetch()
+
+    wait(fn ->
+      assert {:ok, "<html>200</html>"} == File.read(tmp("fetcher/#{path}/fetcher", "page.html"))
+    end)
+  end
+end
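
One detail worth calling out: the retrier is injected through the opts map. @defaults points at a pass-through DummyRetrier implementing Retrier.Spec, and only the "failure: retries" test swaps the real Retrier back in. The same seam admits any custom retry policy; a sketch under that assumption (the module name is illustrative):

    # A retry policy in the shape these tests rely on: Retrier.Spec requires
    # perform/2, which receives the fetch function and the crawl opts.
    defmodule NoRetryRetrier do
      @behaviour Crawler.Fetcher.Retrier.Spec

      # Run the fetch exactly once and never retry.
      def perform(fetch_url, _opts), do: fetch_url.()
    end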

package/crawler/test/lib/crawler/http_test.exs
@@ -0,0 +1,47 @@
+defmodule Crawler.HTTPTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.HTTP
+
+  @moduletag capture_log: true
+
+  doctest HTTP
+
+  test "default user agent", %{bypass: bypass, url: url} do
+    Agent.start_link(fn -> "" end, name: HTTP.DefaultUA)
+
+    Bypass.expect_once(bypass, "GET", "/http/default_ua", fn conn ->
+      {_, ua} = Enum.find(conn.req_headers, fn {header, _} -> header == "user-agent" end)
+      Agent.update(HTTP.DefaultUA, fn _ -> ua end)
+
+      Plug.Conn.resp(conn, 200, "")
+    end)
+
+    Crawler.crawl("#{url}/http/default_ua")
+
+    wait(fn ->
+      assert String.match?(
+               Agent.get(HTTP.DefaultUA, & &1),
+               ~r{Crawler/\d\.\d\.\d \(https://github\.com/fredwu/crawler\)}
+             )
+    end)
+  end
+
+  test "custom user agent", %{bypass: bypass, url: url} do
+    Agent.start_link(fn -> "" end, name: HTTP.CustomUA)
+
+    Bypass.expect_once(bypass, "GET", "/http/custom_ua", fn conn ->
+      {_, ua} = Enum.find(conn.req_headers, fn {header, _} -> header == "user-agent" end)
+
+      Agent.update(HTTP.CustomUA, fn _ -> ua end)
+
+      Plug.Conn.resp(conn, 200, "")
+    end)
+
+    Crawler.crawl("#{url}/http/custom_ua", user_agent: "Hello World")
+
+    wait(fn ->
+      assert Agent.get(HTTP.CustomUA, & &1) == "Hello World"
+    end)
+  end
+end
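
These two tests pin down the user-agent behaviour: requests default to a Crawler/x.y.z (https://github.com/fredwu/crawler) identifier, and the :user_agent option overrides it wholesale. Usage, lifted from the second test (the URL here is a placeholder):

    # Every request in this crawl is sent with the given User-Agent header.
    Crawler.crawl("http://example.com/", user_agent: "Hello World")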

package/crawler/test/lib/crawler/linker/path_builder_test.exs
@@ -0,0 +1,7 @@
+defmodule Crawler.Linker.PathBuilderTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.Linker.PathBuilder
+
+  doctest PathBuilder
+end

package/crawler/test/lib/crawler/linker/path_expander_test.exs
@@ -0,0 +1,7 @@
+defmodule Crawler.Linker.PathExpanderTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.Linker.PathExpander
+
+  doctest PathExpander
+end

package/crawler/test/lib/crawler/linker/path_finder_test.exs
@@ -0,0 +1,7 @@
+defmodule Crawler.Linker.PathFinderTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.Linker.PathFinder
+
+  doctest PathFinder
+end

package/crawler/test/lib/crawler/linker/path_offliner_test.exs
@@ -0,0 +1,7 @@
+defmodule Crawler.Linker.PathOfflinerTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.Linker.PathOffliner
+
+  doctest PathOffliner
+end

package/crawler/test/lib/crawler/linker/path_prefixer_test.exs
@@ -0,0 +1,7 @@
+defmodule Crawler.Linker.PathPrefixerTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.Linker.PathPrefixer
+
+  doctest PathPrefixer
+end

package/crawler/test/lib/crawler/linker_test.exs
@@ -0,0 +1,7 @@
+defmodule Crawler.LinkerTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.Linker
+
+  doctest Linker
+end

package/crawler/test/lib/crawler/options_test.exs
@@ -0,0 +1,7 @@
+defmodule Crawler.OptionsTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.Options
+
+  doctest Options
+end

package/crawler/test/lib/crawler/parser/css_parser_test.exs
@@ -0,0 +1,7 @@
+defmodule Crawler.Parser.CssParserTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.Parser.CssParser
+
+  doctest CssParser
+end

package/crawler/test/lib/crawler/parser/guarder_test.exs
@@ -0,0 +1,7 @@
+defmodule Crawler.Parser.GuarderTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.Parser.Guarder
+
+  doctest Guarder
+end

package/crawler/test/lib/crawler/parser/html_parser_test.exs
@@ -0,0 +1,7 @@
+defmodule Crawler.Parser.HtmlParserTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.Parser.HtmlParser
+
+  doctest HtmlParser
+end

package/crawler/test/lib/crawler/parser/link_parser/link_expander_test.exs
@@ -0,0 +1,7 @@
+defmodule Crawler.Parser.LinkParser.LinkExpanderTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.Parser.LinkParser.LinkExpander
+
+  doctest LinkExpander
+end

package/crawler/test/lib/crawler/parser/link_parser_test.exs
@@ -0,0 +1,7 @@
+defmodule Crawler.Parser.LinkParserTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.Parser.LinkParser
+
+  doctest LinkParser
+end

package/crawler/test/lib/crawler/parser_test.exs
@@ -0,0 +1,8 @@
+defmodule Crawler.ParserTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.Parser
+  alias Crawler.Store.Page
+
+  doctest Parser
+end

package/crawler/test/lib/crawler/queue_handler_test.exs
@@ -0,0 +1,7 @@
+defmodule Crawler.QueueHandlerTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.QueueHandler
+
+  doctest QueueHandler
+end

package/crawler/test/lib/crawler/scraper_test.exs
@@ -0,0 +1,7 @@
+defmodule Crawler.ScraperTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.Scraper
+
+  doctest Scraper
+end

package/crawler/test/lib/crawler/snapper/dir_maker_test.exs
@@ -0,0 +1,7 @@
+defmodule Crawler.Snapper.DirMakerTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.Snapper.DirMaker
+
+  doctest DirMaker
+end

package/crawler/test/lib/crawler/snapper/link_replacer_test.exs
@@ -0,0 +1,7 @@
+defmodule Crawler.Snapper.LinkReplacerTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.Snapper.LinkReplacer
+
+  doctest LinkReplacer
+end

package/crawler/test/lib/crawler/snapper_test.exs
@@ -0,0 +1,9 @@
+defmodule Crawler.SnapperTest do
+  use Crawler.TestCase, async: true
+
+  alias Crawler.Snapper
+
+  @moduletag capture_log: true
+
+  doctest Snapper
+end

package/crawler/test/lib/crawler/worker_test.exs
@@ -0,0 +1,5 @@
+defmodule Crawler.WorkerTest do
+  use Crawler.TestCase, async: true
+
+  doctest Crawler.Worker
+end