powerdlz23 1.2.3 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. package/Spider/README.md +19 -0
  2. package/Spider/domain.py +18 -0
  3. package/Spider/general.py +51 -0
  4. package/Spider/link_finder.py +25 -0
  5. package/Spider/main.py +50 -0
  6. package/Spider/spider.py +74 -0
  7. package/crawler/.formatter.exs +5 -0
  8. package/crawler/.github/workflows/ci.yml +29 -0
  9. package/crawler/.recode.exs +33 -0
  10. package/crawler/.tool-versions +2 -0
  11. package/crawler/CHANGELOG.md +82 -0
  12. package/crawler/README.md +198 -0
  13. package/crawler/architecture.svg +4 -0
  14. package/crawler/config/config.exs +9 -0
  15. package/crawler/config/dev.exs +5 -0
  16. package/crawler/config/test.exs +5 -0
  17. package/crawler/examples/google_search/scraper.ex +37 -0
  18. package/crawler/examples/google_search/url_filter.ex +11 -0
  19. package/crawler/examples/google_search.ex +77 -0
  20. package/crawler/lib/crawler/dispatcher/worker.ex +14 -0
  21. package/crawler/lib/crawler/dispatcher.ex +20 -0
  22. package/crawler/lib/crawler/fetcher/header_preparer.ex +60 -0
  23. package/crawler/lib/crawler/fetcher/modifier.ex +45 -0
  24. package/crawler/lib/crawler/fetcher/policer.ex +77 -0
  25. package/crawler/lib/crawler/fetcher/recorder.ex +55 -0
  26. package/crawler/lib/crawler/fetcher/requester.ex +32 -0
  27. package/crawler/lib/crawler/fetcher/retrier.ex +43 -0
  28. package/crawler/lib/crawler/fetcher/url_filter.ex +26 -0
  29. package/crawler/lib/crawler/fetcher.ex +81 -0
  30. package/crawler/lib/crawler/http.ex +7 -0
  31. package/crawler/lib/crawler/linker/path_builder.ex +71 -0
  32. package/crawler/lib/crawler/linker/path_expander.ex +59 -0
  33. package/crawler/lib/crawler/linker/path_finder.ex +106 -0
  34. package/crawler/lib/crawler/linker/path_offliner.ex +59 -0
  35. package/crawler/lib/crawler/linker/path_prefixer.ex +46 -0
  36. package/crawler/lib/crawler/linker.ex +173 -0
  37. package/crawler/lib/crawler/options.ex +127 -0
  38. package/crawler/lib/crawler/parser/css_parser.ex +37 -0
  39. package/crawler/lib/crawler/parser/guarder.ex +38 -0
  40. package/crawler/lib/crawler/parser/html_parser.ex +41 -0
  41. package/crawler/lib/crawler/parser/link_parser/link_expander.ex +32 -0
  42. package/crawler/lib/crawler/parser/link_parser.ex +50 -0
  43. package/crawler/lib/crawler/parser.ex +122 -0
  44. package/crawler/lib/crawler/queue_handler.ex +45 -0
  45. package/crawler/lib/crawler/scraper.ex +28 -0
  46. package/crawler/lib/crawler/snapper/dir_maker.ex +45 -0
  47. package/crawler/lib/crawler/snapper/link_replacer.ex +95 -0
  48. package/crawler/lib/crawler/snapper.ex +82 -0
  49. package/crawler/lib/crawler/store/counter.ex +19 -0
  50. package/crawler/lib/crawler/store/page.ex +7 -0
  51. package/crawler/lib/crawler/store.ex +87 -0
  52. package/crawler/lib/crawler/worker.ex +62 -0
  53. package/crawler/lib/crawler.ex +91 -0
  54. package/crawler/mix.exs +78 -0
  55. package/crawler/mix.lock +40 -0
  56. package/crawler/test/fixtures/introducing-elixir.jpg +0 -0
  57. package/crawler/test/integration_test.exs +135 -0
  58. package/crawler/test/lib/crawler/dispatcher/worker_test.exs +7 -0
  59. package/crawler/test/lib/crawler/dispatcher_test.exs +5 -0
  60. package/crawler/test/lib/crawler/fetcher/header_preparer_test.exs +7 -0
  61. package/crawler/test/lib/crawler/fetcher/policer_test.exs +71 -0
  62. package/crawler/test/lib/crawler/fetcher/recorder_test.exs +9 -0
  63. package/crawler/test/lib/crawler/fetcher/requester_test.exs +9 -0
  64. package/crawler/test/lib/crawler/fetcher/retrier_test.exs +7 -0
  65. package/crawler/test/lib/crawler/fetcher/url_filter_test.exs +7 -0
  66. package/crawler/test/lib/crawler/fetcher_test.exs +153 -0
  67. package/crawler/test/lib/crawler/http_test.exs +47 -0
  68. package/crawler/test/lib/crawler/linker/path_builder_test.exs +7 -0
  69. package/crawler/test/lib/crawler/linker/path_expander_test.exs +7 -0
  70. package/crawler/test/lib/crawler/linker/path_finder_test.exs +7 -0
  71. package/crawler/test/lib/crawler/linker/path_offliner_test.exs +7 -0
  72. package/crawler/test/lib/crawler/linker/path_prefixer_test.exs +7 -0
  73. package/crawler/test/lib/crawler/linker_test.exs +7 -0
  74. package/crawler/test/lib/crawler/options_test.exs +7 -0
  75. package/crawler/test/lib/crawler/parser/css_parser_test.exs +7 -0
  76. package/crawler/test/lib/crawler/parser/guarder_test.exs +7 -0
  77. package/crawler/test/lib/crawler/parser/html_parser_test.exs +7 -0
  78. package/crawler/test/lib/crawler/parser/link_parser/link_expander_test.exs +7 -0
  79. package/crawler/test/lib/crawler/parser/link_parser_test.exs +7 -0
  80. package/crawler/test/lib/crawler/parser_test.exs +8 -0
  81. package/crawler/test/lib/crawler/queue_handler_test.exs +7 -0
  82. package/crawler/test/lib/crawler/scraper_test.exs +7 -0
  83. package/crawler/test/lib/crawler/snapper/dir_maker_test.exs +7 -0
  84. package/crawler/test/lib/crawler/snapper/link_replacer_test.exs +7 -0
  85. package/crawler/test/lib/crawler/snapper_test.exs +9 -0
  86. package/crawler/test/lib/crawler/worker_test.exs +5 -0
  87. package/crawler/test/lib/crawler_test.exs +295 -0
  88. package/crawler/test/support/test_case.ex +24 -0
  89. package/crawler/test/support/test_helpers.ex +28 -0
  90. package/crawler/test/test_helper.exs +7 -0
  91. package/grell/.rspec +2 -0
  92. package/grell/.travis.yml +28 -0
  93. package/grell/CHANGELOG.md +111 -0
  94. package/grell/Gemfile +7 -0
  95. package/grell/LICENSE.txt +22 -0
  96. package/grell/README.md +213 -0
  97. package/grell/Rakefile +2 -0
  98. package/grell/grell.gemspec +36 -0
  99. package/grell/lib/grell/capybara_driver.rb +44 -0
  100. package/grell/lib/grell/crawler.rb +83 -0
  101. package/grell/lib/grell/crawler_manager.rb +84 -0
  102. package/grell/lib/grell/grell_logger.rb +10 -0
  103. package/grell/lib/grell/page.rb +275 -0
  104. package/grell/lib/grell/page_collection.rb +62 -0
  105. package/grell/lib/grell/rawpage.rb +62 -0
  106. package/grell/lib/grell/reader.rb +18 -0
  107. package/grell/lib/grell/version.rb +3 -0
  108. package/grell/lib/grell.rb +11 -0
  109. package/grell/spec/lib/capybara_driver_spec.rb +38 -0
  110. package/grell/spec/lib/crawler_manager_spec.rb +174 -0
  111. package/grell/spec/lib/crawler_spec.rb +361 -0
  112. package/grell/spec/lib/page_collection_spec.rb +159 -0
  113. package/grell/spec/lib/page_spec.rb +418 -0
  114. package/grell/spec/lib/reader_spec.rb +43 -0
  115. package/grell/spec/spec_helper.rb +66 -0
  116. package/heartmagic/config.py +1 -0
  117. package/heartmagic/heart.py +3 -0
  118. package/heartmagic/pytransform/__init__.py +483 -0
  119. package/heartmagic/pytransform/_pytransform.dll +0 -0
  120. package/heartmagic/pytransform/_pytransform.so +0 -0
  121. package/httpStatusCode/README.md +2 -0
  122. package/httpStatusCode/httpStatusCode.js +4 -0
  123. package/httpStatusCode/reasonPhrases.js +344 -0
  124. package/httpStatusCode/statusCodes.js +344 -0
  125. package/package.json +1 -1
  126. package/rubyretriever/.rspec +2 -0
  127. package/rubyretriever/.travis.yml +7 -0
  128. package/rubyretriever/Gemfile +3 -0
  129. package/rubyretriever/Gemfile.lock +64 -0
  130. package/rubyretriever/LICENSE +20 -0
  131. package/rubyretriever/Rakefile +7 -0
  132. package/rubyretriever/bin/rr +79 -0
  133. package/rubyretriever/lib/retriever/cli.rb +25 -0
  134. package/rubyretriever/lib/retriever/core_ext.rb +13 -0
  135. package/rubyretriever/lib/retriever/fetch.rb +268 -0
  136. package/rubyretriever/lib/retriever/fetchfiles.rb +71 -0
  137. package/rubyretriever/lib/retriever/fetchseo.rb +18 -0
  138. package/rubyretriever/lib/retriever/fetchsitemap.rb +43 -0
  139. package/rubyretriever/lib/retriever/link.rb +47 -0
  140. package/rubyretriever/lib/retriever/openuri_redirect_patch.rb +8 -0
  141. package/rubyretriever/lib/retriever/page.rb +104 -0
  142. package/rubyretriever/lib/retriever/page_iterator.rb +21 -0
  143. package/rubyretriever/lib/retriever/target.rb +47 -0
  144. package/rubyretriever/lib/retriever/version.rb +4 -0
  145. package/rubyretriever/lib/retriever.rb +15 -0
  146. package/rubyretriever/readme.md +166 -0
  147. package/rubyretriever/rubyretriever.gemspec +41 -0
  148. package/rubyretriever/spec/link_spec.rb +77 -0
  149. package/rubyretriever/spec/page_spec.rb +94 -0
  150. package/rubyretriever/spec/retriever_spec.rb +84 -0
  151. package/rubyretriever/spec/spec_helper.rb +17 -0
  152. package/rubyretriever/spec/target_spec.rb +55 -0
  153. package/snapcrawl/.changelog.old.md +157 -0
  154. package/snapcrawl/.gitattributes +1 -0
  155. package/snapcrawl/.github/workflows/test.yml +41 -0
  156. package/snapcrawl/.rspec +3 -0
  157. package/snapcrawl/.rubocop.yml +23 -0
  158. package/snapcrawl/CHANGELOG.md +182 -0
  159. package/snapcrawl/Gemfile +15 -0
  160. package/snapcrawl/LICENSE +21 -0
  161. package/snapcrawl/README.md +135 -0
  162. package/snapcrawl/Runfile +35 -0
  163. package/snapcrawl/bin/snapcrawl +25 -0
  164. package/snapcrawl/lib/snapcrawl/cli.rb +52 -0
  165. package/snapcrawl/lib/snapcrawl/config.rb +60 -0
  166. package/snapcrawl/lib/snapcrawl/crawler.rb +98 -0
  167. package/snapcrawl/lib/snapcrawl/dependencies.rb +21 -0
  168. package/snapcrawl/lib/snapcrawl/exceptions.rb +5 -0
  169. package/snapcrawl/lib/snapcrawl/log_helpers.rb +36 -0
  170. package/snapcrawl/lib/snapcrawl/page.rb +118 -0
  171. package/snapcrawl/lib/snapcrawl/pretty_logger.rb +11 -0
  172. package/snapcrawl/lib/snapcrawl/refinements/pair_split.rb +26 -0
  173. package/snapcrawl/lib/snapcrawl/refinements/string_refinements.rb +13 -0
  174. package/snapcrawl/lib/snapcrawl/screenshot.rb +73 -0
  175. package/snapcrawl/lib/snapcrawl/templates/config.yml +49 -0
  176. package/snapcrawl/lib/snapcrawl/templates/docopt.txt +26 -0
  177. package/snapcrawl/lib/snapcrawl/version.rb +3 -0
  178. package/snapcrawl/lib/snapcrawl.rb +20 -0
  179. package/snapcrawl/snapcrawl.gemspec +27 -0
  180. package/snapcrawl/snapcrawl.yml +41 -0
  181. package/snapcrawl/spec/README.md +16 -0
  182. package/snapcrawl/spec/approvals/bin/help +26 -0
  183. package/snapcrawl/spec/approvals/bin/usage +4 -0
  184. package/snapcrawl/spec/approvals/cli/usage +4 -0
  185. package/snapcrawl/spec/approvals/config/defaults +15 -0
  186. package/snapcrawl/spec/approvals/config/minimal +15 -0
  187. package/snapcrawl/spec/approvals/integration/blacklist +14 -0
  188. package/snapcrawl/spec/approvals/integration/default-config +14 -0
  189. package/snapcrawl/spec/approvals/integration/depth-0 +6 -0
  190. package/snapcrawl/spec/approvals/integration/depth-3 +6 -0
  191. package/snapcrawl/spec/approvals/integration/log-color-no +6 -0
  192. package/snapcrawl/spec/approvals/integration/screenshot-error +3 -0
  193. package/snapcrawl/spec/approvals/integration/whitelist +14 -0
  194. package/snapcrawl/spec/approvals/models/pretty_logger/colors +1 -0
  195. package/snapcrawl/spec/fixtures/config/minimal.yml +4 -0
  196. package/snapcrawl/spec/server/config.ru +97 -0
  197. package/snapcrawl/spec/snapcrawl/bin_spec.rb +15 -0
  198. package/snapcrawl/spec/snapcrawl/cli_spec.rb +9 -0
  199. package/snapcrawl/spec/snapcrawl/config_spec.rb +26 -0
  200. package/snapcrawl/spec/snapcrawl/integration_spec.rb +65 -0
  201. package/snapcrawl/spec/snapcrawl/page_spec.rb +89 -0
  202. package/snapcrawl/spec/snapcrawl/pretty_logger_spec.rb +19 -0
  203. package/snapcrawl/spec/snapcrawl/refinements/pair_split_spec.rb +27 -0
  204. package/snapcrawl/spec/snapcrawl/refinements/string_refinements_spec.rb +29 -0
  205. package/snapcrawl/spec/snapcrawl/screenshot_spec.rb +62 -0
  206. package/snapcrawl/spec/spec_helper.rb +22 -0
  207. package/snapcrawl/spec/spec_mixin.rb +10 -0
@@ -0,0 +1,82 @@
1
defmodule Crawler.Snapper do
  @moduledoc """
  Stores crawled pages offline.
  """

  require Logger

  alias Crawler.Snapper.DirMaker
  alias Crawler.Snapper.LinkReplacer

  @doc """
  In order to store pages offline, it provides the following functionalities:

  - replaces all URLs to their equivalent relative paths
  - creates directories when necessary to store the files

  ## Examples

      iex> Snapper.snap("hello", %{save_to: tmp("snapper"), url: "http://hello-world.local"})
      iex> File.read(tmp("snapper/hello-world.local", "index.html"))
      {:ok, "hello"}

      iex> Snapper.snap("hello", %{save_to: tmp("snapper"), url: "http://snapper.local/index.html"})
      iex> File.read(tmp("snapper/snapper.local", "index.html"))
      {:ok, "hello"}

      iex> Snapper.snap("hello", %{save_to: "nope", url: "http://snapper.local/index.html"})
      {:error, "Cannot write to file nope/snapper.local/index.html, reason: enoent"}

      iex> Snapper.snap("hello", %{save_to: tmp("snapper"), url: "http://snapper.local/hello"})
      iex> File.read(tmp("snapper/snapper.local/hello", "index.html"))
      {:ok, "hello"}

      iex> Snapper.snap("hello", %{save_to: tmp("snapper"), url: "http://snapper.local/hello1/"})
      iex> File.read(tmp("snapper/snapper.local/hello1", "index.html"))
      {:ok, "hello"}

      iex> Snapper.snap(
      iex>   "<a href='http://another.domain/page'></a>",
      iex>   %{
      iex>     save_to: tmp("snapper"),
      iex>     url: "http://snapper.local/depth0",
      iex>     depth: 1,
      iex>     max_depths: 2,
      iex>     html_tag: "a",
      iex>     content_type: "text/html",
      iex>   }
      iex> )
      iex> File.read(tmp("snapper/snapper.local/depth0", "index.html"))
      {:ok, "<a href='../../another.domain/page/index.html'></a>"}

      iex> Snapper.snap(
      iex>   "<a href='https://another.domain:8888/page'></a>",
      iex>   %{
      iex>     save_to: tmp("snapper"),
      iex>     url: "http://snapper.local:7777/dir/depth1",
      iex>     depth: 1,
      iex>     max_depths: 2,
      iex>     html_tag: "a",
      iex>     content_type: "text/html",
      iex>   }
      iex> )
      iex> File.read(tmp("snapper/snapper.local-7777/dir/depth1", "index.html"))
      {:ok, "<a href='../../../another.domain-8888/page/index.html'></a>"}
  """
  def snap(body, opts) do
    # A failed link replacement is a bug upstream; crash via the match.
    {:ok, replaced} = LinkReplacer.replace_links(body, opts)

    opts
    |> DirMaker.make_dir()
    |> write_snapshot(replaced, opts)
  end

  # Writes the snapshot content to `file_path`, returning `{:ok, opts}` on
  # success or a logged `{:error, msg}` when the file cannot be written.
  defp write_snapshot(file_path, content, opts) do
    case File.write(file_path, content) do
      :ok ->
        {:ok, opts}

      {:error, reason} ->
        msg = "Cannot write to file #{file_path}, reason: #{reason}"

        Logger.error(msg)

        {:error, msg}
    end
  end
end
@@ -0,0 +1,19 @@
1
defmodule Crawler.Store.Counter do
  @moduledoc """
  A simple crawl-ops counter backed by an `Agent` registered under the
  module name, so a single shared count exists per node.
  """

  use Agent

  @doc "Starts the counter agent with an initial count of zero."
  def start_link(_args), do: Agent.start_link(fn -> 0 end, name: __MODULE__)

  @doc "Returns the current count."
  def value, do: Agent.get(__MODULE__, fn count -> count end)

  @doc "Increments the count by one."
  def inc, do: Agent.update(__MODULE__, fn count -> count + 1 end)

  @doc "Resets the count back to zero."
  def reset, do: Agent.update(__MODULE__, fn _count -> 0 end)
end
@@ -0,0 +1,7 @@
1
defmodule Crawler.Store.Page do
  @moduledoc """
  An internal struct for keeping the url and content of a crawled page.
  """

  # Fields:
  #   :url       - the page's URL, set when the page is first registered
  #   :body      - the fetched page content, filled in after a fetch
  #   :opts      - the crawl options associated with this page
  #   :processed - set to true once the page completes the crawl workflow
  defstruct [:url, :body, :opts, :processed]
end
@@ -0,0 +1,87 @@
1
defmodule Crawler.Store do
  @moduledoc """
  An internal data store for information related to each crawl.

  Pages are keyed by `{url, scope}` in a unique `Registry` named
  `Crawler.Store.DB`; a global ops counter tracks crawl progress.
  """

  alias Crawler.Store.Counter
  alias Crawler.Store.DB
  alias Crawler.Store.Page

  use GenServer

  # NOTE(review): despite `use GenServer`, this function starts a Supervisor
  # over the Registry and the Counter; only `init/1` below looks like a
  # GenServer callback. Confirm this hybrid setup is intentional.
  def start_link(opts) do
    children = [
      {Registry, keys: :unique, name: DB},
      Counter
    ]

    Supervisor.start_link(
      children,
      [strategy: :one_for_one, name: __MODULE__] ++ opts
    )
  end

  @doc """
  Initialises a new `Registry` named `Crawler.Store.DB`.
  """
  def init(args) do
    {:ok, args}
  end

  @doc """
  Finds a stored URL and returns its page data.
  """
  def find({url, scope}) do
    # A unique registry yields at most one entry per key.
    case Registry.lookup(DB, {url, scope}) do
      [{_, page}] -> page
      _ -> nil
    end
  end

  @doc """
  Finds a stored URL and returns its page data only if it's processed.
  """
  def find_processed({url, scope}) do
    # Matches only entries whose stored value has `processed: true`.
    case Registry.match(DB, {url, scope}, %{processed: true}) do
      [{_, page}] -> page
      _ -> nil
    end
  end

  @doc """
  Adds a URL to the registry.
  """
  def add({url, scope}) do
    # Registers the calling process as owner; value starts as a bare Page
    # with only the URL set.
    Registry.register(DB, {url, scope}, %Page{url: url})
  end

  @doc """
  Adds the page data for a URL to the registry.
  """
  def add_page_data({url, scope}, body, opts) do
    # Only the owning process may update; match asserts the update succeeded.
    {_new, _old} = Registry.update_value(DB, {url, scope}, &%{&1 | body: body, opts: opts})
  end

  @doc """
  Marks a URL as processed in the registry.
  """
  def processed({url, scope}) do
    {_new, _old} = Registry.update_value(DB, {url, scope}, &%{&1 | processed: true})
  end

  # Returns every `{url, scope}` key currently in the registry.
  def all_urls do
    Registry.select(DB, [{{:"$1", :_, :_}, [], [:"$1"]}])
  end

  # Increments the global ops counter (bumped once per processed page).
  def ops_inc do
    Counter.inc()
  end

  # Current value of the global ops counter.
  def ops_count do
    Counter.value()
  end

  # Resets the global ops counter to zero.
  def ops_reset do
    Counter.reset()
  end
end
@@ -0,0 +1,62 @@
1
defmodule Crawler.Worker do
  @moduledoc """
  Handles the crawl tasks.
  """

  require Logger

  alias Crawler.Fetcher
  alias Crawler.Store
  alias Crawler.Store.Page

  use GenServer

  def init(args) do
    # Schedule a :stop message so the worker shuts itself down after the
    # configured timeout.
    :timer.send_after(args[:timeout], :stop)

    {:ok, args}
  end

  @doc """
  Runs the worker that casts data to itself to kick off the crawl workflow.
  """
  def run(opts) do
    Logger.debug("Running worker with opts: #{inspect(opts)}")

    {:ok, pid} = GenServer.start_link(__MODULE__, opts, hibernate_after: 0)

    GenServer.cast(pid, opts)
  end

  @doc """
  A crawl workflow that delegates responsibilities to:

  - `Crawler.Fetcher.fetch/1`
  - `Crawler.Parser.parse/1` (or a custom parser)
  """
  def handle_cast(_req, state) do
    Logger.debug("Running worker with opts: #{inspect(state)}")

    fetched = Fetcher.fetch(state)
    parsed = state[:parser].parse(fetched)

    mark_processed(parsed)

    {:noreply, state, :hibernate}
  end

  # Timeout scheduled in init/1 fired: shut the worker down normally.
  def handle_info(:stop, state) do
    {:stop, :normal, state}
  end

  # Ignore any other stray messages.
  def handle_info(_msg, state) do
    {:noreply, state}
  end

  # On a successful parse, bump the global ops counter and flag the URL as
  # processed in the store.
  defp mark_processed({:ok, %Page{url: url, opts: opts}}) do
    Store.ops_inc()
    Store.processed({url, opts[:scope]})
  end

  # Anything other than `{:ok, %Page{}}` is ignored.
  defp mark_processed(_), do: nil
end
@@ -0,0 +1,91 @@
1
defmodule Crawler do
  @moduledoc """
  A high performance web crawler in Elixir.
  """

  alias Crawler.Options
  alias Crawler.QueueHandler
  alias Crawler.Store
  alias Crawler.Worker

  use Application

  @doc """
  Crawler is an application that gets started automatically with:

  - a `Crawler.Store` that initiates a `Registry` for keeping internal data
  """
  def start(_type, _args) do
    children = [
      Store,
      {DynamicSupervisor, name: Crawler.QueueSupervisor, strategy: :one_for_one}
    ]

    Supervisor.start_link(children, strategy: :one_for_one, name: Crawler)
  end

  @doc """
  Enqueues a crawl, via `Crawler.QueueHandler.enqueue/1`.

  This is the default crawl behaviour as the queue determines when an actual
  crawl should happen based on the available workers and the rate limit. The
  queue kicks off `Crawler.Dispatcher.Worker` which in turn calls
  `Crawler.crawl_now/1`.
  """
  def crawl(url, opts \\ []) do
    opts = build_opts(url, opts)

    # Skip enqueueing once the page budget has been reached.
    if Store.ops_count() < opts[:max_pages] do
      QueueHandler.enqueue(opts)
    end
  end

  # Normalises the given options into a map and assigns defaults, scope,
  # the URL and any default actions.
  defp build_opts(url, opts) do
    opts
    |> Enum.into(%{})
    |> Options.assign_defaults()
    |> Options.assign_scope()
    |> Options.assign_url(url)
    |> Options.perform_default_actions()
  end

  @doc """
  Stops the crawler.
  """
  def stop(opts) do
    # Trap exits so stopping the queue doesn't take the caller down with it.
    Process.flag(:trap_exit, true)
    OPQ.stop(opts[:queue])
  end

  @doc """
  Pauses the crawler.
  """
  def pause(opts), do: OPQ.pause(opts[:queue])

  @doc """
  Resumes the crawler after it was paused.
  """
  def resume(opts), do: OPQ.resume(opts[:queue])

  @doc """
  Checks whether the crawler is still crawling.
  """
  def running?(opts) do
    # Give in-flight work a brief moment before inspecting state.
    Process.sleep(10)

    queue = opts[:queue]
    paused? = elem(OPQ.info(queue), 0) == :paused

    cond do
      paused? -> false
      Store.ops_count() <= 1 -> true
      queue |> OPQ.queue() |> Enum.any?() -> true
      true -> false
    end
  end

  @doc """
  Crawls immediately, this is used by `Crawler.Dispatcher.Worker.start_link/1`.

  For general purpose use cases, always use `Crawler.crawl/2` instead.
  """
  def crawl_now(opts) do
    if Store.ops_count() < opts[:max_pages] do
      Worker.run(opts)
    end
  end
end
@@ -0,0 +1,78 @@
1
defmodule Crawler.Mixfile do
  use Mix.Project

  @source_url "https://github.com/fredwu/crawler"
  @version "1.5.0"
  @description "A high performance web crawler in Elixir."

  def project do
    [
      app: :crawler,
      version: @version,
      elixir: "~> 1.13",
      elixirc_paths: elixirc_paths(Mix.env()),
      package: package(),
      name: "Crawler",
      description: @description,
      start_permanent: Mix.env() == :prod,
      deps: deps(),
      docs: docs(),
      test_coverage: [tool: ExCoveralls],
      preferred_cli_env: [coveralls: :test],
      # `mix publish` pushes to Hex and then tags the release in git.
      aliases: [publish: ["hex.publish", &git_tag/1]],
      dialyzer: [
        plt_add_apps: [:crawler],
        flags: [:error_handling, :race_conditions, :underspecs]
      ]
    ]
  end

  def application do
    [
      mod: {Crawler, []},
      extra_applications: [:logger, :runtime_tools, :observer, :wx]
    ]
  end

  # Extra compile paths per environment: test support files and dev examples.
  defp elixirc_paths(env) do
    case env do
      :test -> ["lib", "test/support"]
      :dev -> ["lib", "examples"]
      _ -> ["lib"]
    end
  end

  defp deps do
    [
      # Runtime dependencies.
      {:httpoison, "~> 2.1"},
      {:floki, "~> 0.30"},
      {:opq, "~> 4.0"},
      {:retry, "~> 0.10"},
      # Dev tooling.
      {:recode, "~> 0.6", only: :dev},
      {:ex_doc, ">= 0.0.0", only: :dev},
      {:dialyxir, "~> 1.1", only: [:dev, :test], runtime: false},
      # Test-only dependencies.
      {:plug_cowboy, "~> 2.0", only: :test},
      {:bypass, "~> 2.1", only: :test},
      {:excoveralls, "~> 0.7", only: :test}
    ]
  end

  defp package do
    [
      maintainers: ["Fred Wu"],
      licenses: ["MIT"],
      links: %{"GitHub" => @source_url}
    ]
  end

  # Tags the current version and pushes the commit and tag upstream.
  defp git_tag(_args) do
    System.cmd("git", ["tag", "v" <> Mix.Project.config()[:version]])
    System.cmd("git", ["push"])
    System.cmd("git", ["push", "--tags"])
  end

  defp docs do
    [
      extras: ["CHANGELOG.md": [title: "Changelog"], "README.md": [title: "Overview"]],
      main: "readme",
      source_url: @source_url,
      source_ref: "v#{@version}",
      formatters: ["html"]
    ]
  end
end
@@ -0,0 +1,40 @@
1
+ %{
2
+ "bunt": {:hex, :bunt, "0.2.1", "e2d4792f7bc0ced7583ab54922808919518d0e57ee162901a16a1b6664ef3b14", [:mix], [], "hexpm", "a330bfb4245239787b15005e66ae6845c9cd524a288f0d141c148b02603777a5"},
3
+ "bypass": {:hex, :bypass, "2.1.0", "909782781bf8e20ee86a9cabde36b259d44af8b9f38756173e8f5e2e1fabb9b1", [:mix], [{:plug, "~> 1.7", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 2.0", [hex: :plug_cowboy, repo: "hexpm", optional: false]}, {:ranch, "~> 1.3", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm", "d9b5df8fa5b7a6efa08384e9bbecfe4ce61c77d28a4282f79e02f1ef78d96b80"},
4
+ "certifi": {:hex, :certifi, "2.12.0", "2d1cca2ec95f59643862af91f001478c9863c2ac9cb6e2f89780bfd8de987329", [:rebar3], [], "hexpm", "ee68d85df22e554040cdb4be100f33873ac6051387baf6a8f6ce82272340ff1c"},
5
+ "cowboy": {:hex, :cowboy, "2.10.0", "ff9ffeff91dae4ae270dd975642997afe2a1179d94b1887863e43f681a203e26", [:make, :rebar3], [{:cowlib, "2.12.1", [hex: :cowlib, repo: "hexpm", optional: false]}, {:ranch, "1.8.0", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm", "3afdccb7183cc6f143cb14d3cf51fa00e53db9ec80cdcd525482f5e99bc41d6b"},
6
+ "cowboy_telemetry": {:hex, :cowboy_telemetry, "0.4.0", "f239f68b588efa7707abce16a84d0d2acf3a0f50571f8bb7f56a15865aae820c", [:rebar3], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:telemetry, "~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "7d98bac1ee4565d31b62d59f8823dfd8356a169e7fcbb83831b8a5397404c9de"},
7
+ "cowlib": {:hex, :cowlib, "2.12.1", "a9fa9a625f1d2025fe6b462cb865881329b5caff8f1854d1cbc9f9533f00e1e1", [:make, :rebar3], [], "hexpm", "163b73f6367a7341b33c794c4e88e7dbfe6498ac42dcd69ef44c5bc5507c8db0"},
8
+ "dialyxir": {:hex, :dialyxir, "1.4.1", "a22ed1e7bd3a3e3f197b68d806ef66acb61ee8f57b3ac85fc5d57354c5482a93", [:mix], [{:erlex, ">= 0.2.6", [hex: :erlex, repo: "hexpm", optional: false]}], "hexpm", "84b795d6d7796297cca5a3118444b80c7d94f7ce247d49886e7c291e1ae49801"},
9
+ "earmark_parser": {:hex, :earmark_parser, "1.4.36", "487ea8ef9bdc659f085e6e654f3c3feea1d36ac3943edf9d2ef6c98de9174c13", [:mix], [], "hexpm", "a524e395634bdcf60a616efe77fd79561bec2e930d8b82745df06ab4e844400a"},
10
+ "erlex": {:hex, :erlex, "0.2.6", "c7987d15e899c7a2f34f5420d2a2ea0d659682c06ac607572df55a43753aa12e", [:mix], [], "hexpm", "2ed2e25711feb44d52b17d2780eabf998452f6efda104877a3881c2f8c0c0c75"},
11
+ "ex_doc": {:hex, :ex_doc, "0.30.6", "5f8b54854b240a2b55c9734c4b1d0dd7bdd41f71a095d42a70445c03cf05a281", [:mix], [{:earmark_parser, "~> 1.4.31", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "bd48f2ddacf4e482c727f9293d9498e0881597eae6ddc3d9562bd7923375109f"},
12
+ "excoveralls": {:hex, :excoveralls, "0.17.1", "83fa7906ef23aa7fc8ad7ee469c357a63b1b3d55dd701ff5b9ce1f72442b2874", [:mix], [{:castore, "~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "95bc6fda953e84c60f14da4a198880336205464e75383ec0f570180567985ae0"},
13
+ "floki": {:hex, :floki, "0.34.3", "5e2dcaec5d7c228ce5b1d3501502e308b2d79eb655e4191751a1fe491c37feac", [:mix], [], "hexpm", "9577440eea5b97924b4bf3c7ea55f7b8b6dce589f9b28b096cc294a8dc342341"},
14
+ "gen_stage": {:hex, :gen_stage, "1.2.1", "19d8b5e9a5996d813b8245338a28246307fd8b9c99d1237de199d21efc4c76a1", [:mix], [], "hexpm", "83e8be657fa05b992ffa6ac1e3af6d57aa50aace8f691fcf696ff02f8335b001"},
15
+ "glob_ex": {:hex, :glob_ex, "0.1.4", "fc69cb3f6df9138a1e36e9aa041ef2eab0d4dfe916331425f6bac290d1977e79", [:mix], [], "hexpm", "583d35559dc5b17f14612f7153aaaf6dcc13edf2e383126e2dfb5f2d19c78b89"},
16
+ "hackney": {:hex, :hackney, "1.19.1", "59de4716e985dd2b5cbd4954fa1ae187e2b610a9c4520ffcb0b1653c3d6e5559", [:rebar3], [{:certifi, "~> 2.12.0", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~> 6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~> 1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~> 1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.4.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~> 1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~> 0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "8aa08234bdefc269995c63c2282cf3cd0e36febe3a6bfab11b610572fdd1cad0"},
17
+ "httpoison": {:hex, :httpoison, "2.1.0", "655fd9a7b0b95ee3e9a3b535cf7ac8e08ef5229bab187fa86ac4208b122d934b", [:mix], [{:hackney, "~> 1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "fc455cb4306b43827def4f57299b2d5ac8ac331cb23f517e734a4b78210a160c"},
18
+ "idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~> 0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"},
19
+ "jason": {:hex, :jason, "1.4.1", "af1504e35f629ddcdd6addb3513c3853991f694921b1b9368b0bd32beb9f1b63", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "fbb01ecdfd565b56261302f7e1fcc27c4fb8f32d56eab74db621fc154604a7a1"},
20
+ "makeup": {:hex, :makeup, "1.1.0", "6b67c8bc2882a6b6a445859952a602afc1a41c2e08379ca057c0f525366fc3ca", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "0a45ed501f4a8897f580eabf99a2e5234ea3e75a4373c8a52824f6e873be57a6"},
21
+ "makeup_elixir": {:hex, :makeup_elixir, "0.16.1", "cc9e3ca312f1cfeccc572b37a09980287e243648108384b97ff2b76e505c3555", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "e127a341ad1b209bd80f7bd1620a15693a9908ed780c3b763bccf7d200c767c6"},
22
+ "makeup_erlang": {:hex, :makeup_erlang, "0.1.2", "ad87296a092a46e03b7e9b0be7631ddcf64c790fa68a9ef5323b6cbb36affc72", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "f3f5a1ca93ce6e092d92b6d9c049bcda58a3b617a8d888f8e7231c85630e8108"},
23
+ "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"},
24
+ "mime": {:hex, :mime, "2.0.5", "dc34c8efd439abe6ae0343edbb8556f4d63f178594894720607772a041b04b02", [:mix], [], "hexpm", "da0d64a365c45bc9935cc5c8a7fc5e49a0e0f9932a761c55d6c52b142780a05c"},
25
+ "mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"},
26
+ "nimble_parsec": {:hex, :nimble_parsec, "1.3.1", "2c54013ecf170e249e9291ed0a62e5832f70a476c61da16f6aac6dca0189f2af", [:mix], [], "hexpm", "2682e3c0b2eb58d90c6375fc0cc30bc7be06f365bf72608804fb9cffa5e1b167"},
27
+ "opq": {:hex, :opq, "4.0.3", "04fd4bc42d8de8ea0175a1758d2f88fd22c2ead5342cbcb7777d899ba7c8b44f", [:mix], [{:gen_stage, "~> 1.1", [hex: :gen_stage, repo: "hexpm", optional: false]}], "hexpm", "78cb240d11ceeb72008df5613d59ebdc1dc7c1a896a35a500c59075b0931f9c8"},
28
+ "parse_trans": {:hex, :parse_trans, "3.4.1", "6e6aa8167cb44cc8f39441d05193be6e6f4e7c2946cb2759f015f8c56b76e5ff", [:rebar3], [], "hexpm", "620a406ce75dada827b82e453c19cf06776be266f5a67cff34e1ef2cbb60e49a"},
29
+ "plug": {:hex, :plug, "1.14.2", "cff7d4ec45b4ae176a227acd94a7ab536d9b37b942c8e8fa6dfc0fff98ff4d80", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:plug_crypto, "~> 1.1.1 or ~> 1.2", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.3 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "842fc50187e13cf4ac3b253d47d9474ed6c296a8732752835ce4a86acdf68d13"},
30
+ "plug_cowboy": {:hex, :plug_cowboy, "2.6.1", "9a3bbfceeb65eff5f39dab529e5cd79137ac36e913c02067dba3963a26efe9b2", [:mix], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:cowboy_telemetry, "~> 0.3", [hex: :cowboy_telemetry, repo: "hexpm", optional: false]}, {:plug, "~> 1.14", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm", "de36e1a21f451a18b790f37765db198075c25875c64834bcc82d90b309eb6613"},
31
+ "plug_crypto": {:hex, :plug_crypto, "1.2.5", "918772575e48e81e455818229bf719d4ab4181fcbf7f85b68a35620f78d89ced", [:mix], [], "hexpm", "26549a1d6345e2172eb1c233866756ae44a9609bd33ee6f99147ab3fd87fd842"},
32
+ "ranch": {:hex, :ranch, "1.8.0", "8c7a100a139fd57f17327b6413e4167ac559fbc04ca7448e9be9057311597a1d", [:make, :rebar3], [], "hexpm", "49fbcfd3682fab1f5d109351b61257676da1a2fdbe295904176d5e521a2ddfe5"},
33
+ "recode": {:hex, :recode, "0.6.4", "a3370bda63376953fb5e4698593c64388f27efd60f6b06763dbcd37e869c159e", [:mix], [{:bunt, "~> 0.2", [hex: :bunt, repo: "hexpm", optional: false]}, {:glob_ex, "~> 0.1", [hex: :glob_ex, repo: "hexpm", optional: false]}, {:rewrite, "~> 0.9", [hex: :rewrite, repo: "hexpm", optional: false]}], "hexpm", "46700acb62d1f585a25cd3c3f5b19377911911d34107c9a5879d5e0bc6544995"},
34
+ "retry": {:hex, :retry, "0.18.0", "dc58ebe22c95aa00bc2459f9e0c5400e6005541cf8539925af0aa027dc860543", [:mix], [], "hexpm", "9483959cc7bf69c9e576d9dfb2b678b71c045d3e6f39ab7c9aa1489df4492d73"},
35
+ "rewrite": {:hex, :rewrite, "0.9.0", "90f2108ee657705bea00fa30d56dc26b8113ddfe1481487b8f6687871316a131", [:mix], [{:glob_ex, "~> 0.1", [hex: :glob_ex, repo: "hexpm", optional: false]}, {:sourceror, "~> 0.13", [hex: :sourceror, repo: "hexpm", optional: false]}], "hexpm", "5ee26ba5ab0ae3c1155b2ba8093d2bbf78346b85c8493bc0bb4b49a3d6b3330f"},
36
+ "sourceror": {:hex, :sourceror, "0.14.0", "b6b8552d0240400d66b6f107c1bab7ac1726e998efc797f178b7b517e928e314", [:mix], [], "hexpm", "809c71270ad48092d40bbe251a133e49ae229433ce103f762a2373b7a10a8d8b"},
37
+ "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.7", "354c321cf377240c7b8716899e182ce4890c5938111a1296add3ec74cf1715df", [:make, :mix, :rebar3], [], "hexpm", "fe4c190e8f37401d30167c8c405eda19469f34577987c76dde613e838bbc67f8"},
38
+ "telemetry": {:hex, :telemetry, "1.2.1", "68fdfe8d8f05a8428483a97d7aab2f268aaff24b49e0f599faa091f1d4e7f61c", [:rebar3], [], "hexpm", "dad9ce9d8effc621708f99eac538ef1cbe05d6a874dd741de2e689c47feafed5"},
39
+ "unicode_util_compat": {:hex, :unicode_util_compat, "0.7.0", "bc84380c9ab48177092f43ac89e4dfa2c6d62b40b8bd132b1059ecc7232f9a78", [:rebar3], [], "hexpm", "25eee6d67df61960cf6a794239566599b09e17e668d3700247bc498638152521"},
40
+ }
@@ -0,0 +1,135 @@
1
defmodule IntegrationTest do
  use Crawler.TestCase, async: false

  import Plug.Conn

  @moduletag capture_log: true

  # End-to-end crawl across two stubbed servers: pages link to each other
  # (including cross-host and relative links) and reference assets; after the
  # crawl we verify each file landed on disk with its links rewritten to
  # relative local paths.
  test "integration", %{
    bypass: bypass,
    url: url,
    path: path,
    bypass2: bypass2,
    url2: url2,
    path2: path2
  } do
    linked_url1 = "#{url}/page1.html"
    linked_url2 = "#{url}/dir/page2.html"
    linked_url3 = "#{url2}/page3.html"

    page1_raw = "<html><a href='#{linked_url2}'>2</a> <a href='#{linked_url3}'>3</a></html>"
    page2_raw = "<html><a href='#{linked_url3}'>3</a></html>"
    page3_raw = "<html><a href='dir/page4'>4</a> <a href='/dir/page4'>4</a></html>"

    page4_raw =
      "<html><head><script type='text/javascript' src='/javascript.js' /><link rel='stylesheet' href='../styles.css' /></head><a href='../page5.html'>5</a> <img src='../image1.png' /></html>"

    page5_raw = "<html><a href='/page6'>6</a> <img src='/image2.png' /></html>"
    css_raw = "img { url(image3.png); }"

    # Every route the crawl is expected to request exactly once:
    # {server, path, content-type, response body}.
    stubs = [
      {bypass, "/page1.html", "text/html", page1_raw},
      {bypass, "/dir/page2.html", "text/html", page2_raw},
      {bypass2, "/page3.html", "text/html", page3_raw},
      {bypass2, "/dir/page4", "text/html", page4_raw},
      {bypass2, "/page5.html", "text/html", page5_raw},
      {bypass2, "/image1.png", "image/png", "png"},
      {bypass2, "/image2.png", "image/png", "png"},
      {bypass2, "/image3.png", "image/png", "png"},
      {bypass2, "/styles.css", "text/css", css_raw},
      {bypass2, "/javascript.js", "application/javascript", "js"}
    ]

    Enum.each(stubs, fn {server, route, content_type, body} ->
      Bypass.expect_once(server, "GET", route, fn conn ->
        conn
        |> put_resp_header("content-type", content_type)
        |> resp(200, body)
      end)
    end)

    Crawler.crawl(linked_url1,
      save_to: tmp("integration"),
      max_depths: 4,
      assets: ["js", "css", "images"]
    )

    # Expected on-disk content after the crawler rewrites links/srcs to
    # relative local paths (extensionless pages gain /index.html).
    page1 =
      "<html><a href='../#{path}/dir/page2.html'>2</a> <a href='../#{path2}/page3.html'>3</a></html>"

    page2 = "<html><a href='../../#{path2}/page3.html'>3</a></html>"

    page3 =
      "<html><a href='../#{path2}/dir/page4/index.html'>4</a> <a href='../#{path2}/dir/page4/index.html'>4</a></html>"

    page4 =
      "<html><head><script type='text/javascript' src='../../../#{path2}/javascript.js' /><link rel='stylesheet' href='../../../#{path2}/styles.css' /></head><a href='../../../#{path2}/page5.html'>5</a> <img src='../../../#{path2}/image1.png' /></html>"

    page5 =
      "<html><a href='../#{path2}/page6/index.html'>6</a> <img src='../#{path2}/image2.png' /></html>"

    css = "img { url(../#{path2}/image3.png); }"

    # The crawl is asynchronous, so poll until every file shows up as expected.
    wait(fn ->
      assert {:ok, page1} == File.read(tmp("integration/#{path}", "page1.html"))
      assert {:ok, page2} == File.read(tmp("integration/#{path}/dir", "page2.html"))
      assert {:ok, page3} == File.read(tmp("integration/#{path2}", "page3.html"))
      assert {:ok, page4} == File.read(tmp("integration/#{path2}/dir/page4", "index.html"))
      assert {:ok, page5} == File.read(tmp("integration/#{path2}", "page5.html"))
      assert {:ok, "png"} == File.read(tmp("integration/#{path2}", "image1.png"))
      assert {:ok, "png"} == File.read(tmp("integration/#{path2}", "image2.png"))
      assert {:ok, "png"} == File.read(tmp("integration/#{path2}", "image3.png"))
      assert {:ok, css} == File.read(tmp("integration/#{path2}", "styles.css"))
      assert {:ok, "js"} == File.read(tmp("integration/#{path2}", "javascript.js"))
    end)
  end
end
@@ -0,0 +1,7 @@
1
defmodule Crawler.Dispatcher.WorkerTest do
  use Crawler.TestCase, async: true

  # Runs the doctests embedded in the worker module's @doc examples.
  doctest Crawler.Dispatcher.Worker
end
@@ -0,0 +1,5 @@
1
defmodule Crawler.DispatcherTest do
  use Crawler.TestCase, async: true

  alias Crawler.Dispatcher

  # Runs the doctests embedded in the dispatcher module's @doc examples.
  doctest Dispatcher
end
@@ -0,0 +1,7 @@
1
defmodule Crawler.Fetcher.HeaderPreparerTest do
  use Crawler.TestCase, async: true

  # Runs the doctests embedded in the header preparer's @doc examples.
  doctest Crawler.Fetcher.HeaderPreparer
end