powerdlz23 1.2.3 → 1.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. package/Spider/README.md +19 -0
  2. package/Spider/domain.py +18 -0
  3. package/Spider/general.py +51 -0
  4. package/Spider/link_finder.py +25 -0
  5. package/Spider/main.py +50 -0
  6. package/Spider/spider.py +74 -0
  7. package/crawler/.formatter.exs +5 -0
  8. package/crawler/.github/workflows/ci.yml +29 -0
  9. package/crawler/.recode.exs +33 -0
  10. package/crawler/.tool-versions +2 -0
  11. package/crawler/CHANGELOG.md +82 -0
  12. package/crawler/README.md +198 -0
  13. package/crawler/architecture.svg +4 -0
  14. package/crawler/config/config.exs +9 -0
  15. package/crawler/config/dev.exs +5 -0
  16. package/crawler/config/test.exs +5 -0
  17. package/crawler/examples/google_search/scraper.ex +37 -0
  18. package/crawler/examples/google_search/url_filter.ex +11 -0
  19. package/crawler/examples/google_search.ex +77 -0
  20. package/crawler/lib/crawler/dispatcher/worker.ex +14 -0
  21. package/crawler/lib/crawler/dispatcher.ex +20 -0
  22. package/crawler/lib/crawler/fetcher/header_preparer.ex +60 -0
  23. package/crawler/lib/crawler/fetcher/modifier.ex +45 -0
  24. package/crawler/lib/crawler/fetcher/policer.ex +77 -0
  25. package/crawler/lib/crawler/fetcher/recorder.ex +55 -0
  26. package/crawler/lib/crawler/fetcher/requester.ex +32 -0
  27. package/crawler/lib/crawler/fetcher/retrier.ex +43 -0
  28. package/crawler/lib/crawler/fetcher/url_filter.ex +26 -0
  29. package/crawler/lib/crawler/fetcher.ex +81 -0
  30. package/crawler/lib/crawler/http.ex +7 -0
  31. package/crawler/lib/crawler/linker/path_builder.ex +71 -0
  32. package/crawler/lib/crawler/linker/path_expander.ex +59 -0
  33. package/crawler/lib/crawler/linker/path_finder.ex +106 -0
  34. package/crawler/lib/crawler/linker/path_offliner.ex +59 -0
  35. package/crawler/lib/crawler/linker/path_prefixer.ex +46 -0
  36. package/crawler/lib/crawler/linker.ex +173 -0
  37. package/crawler/lib/crawler/options.ex +127 -0
  38. package/crawler/lib/crawler/parser/css_parser.ex +37 -0
  39. package/crawler/lib/crawler/parser/guarder.ex +38 -0
  40. package/crawler/lib/crawler/parser/html_parser.ex +41 -0
  41. package/crawler/lib/crawler/parser/link_parser/link_expander.ex +32 -0
  42. package/crawler/lib/crawler/parser/link_parser.ex +50 -0
  43. package/crawler/lib/crawler/parser.ex +122 -0
  44. package/crawler/lib/crawler/queue_handler.ex +45 -0
  45. package/crawler/lib/crawler/scraper.ex +28 -0
  46. package/crawler/lib/crawler/snapper/dir_maker.ex +45 -0
  47. package/crawler/lib/crawler/snapper/link_replacer.ex +95 -0
  48. package/crawler/lib/crawler/snapper.ex +82 -0
  49. package/crawler/lib/crawler/store/counter.ex +19 -0
  50. package/crawler/lib/crawler/store/page.ex +7 -0
  51. package/crawler/lib/crawler/store.ex +87 -0
  52. package/crawler/lib/crawler/worker.ex +62 -0
  53. package/crawler/lib/crawler.ex +91 -0
  54. package/crawler/mix.exs +78 -0
  55. package/crawler/mix.lock +40 -0
  56. package/crawler/test/fixtures/introducing-elixir.jpg +0 -0
  57. package/crawler/test/integration_test.exs +135 -0
  58. package/crawler/test/lib/crawler/dispatcher/worker_test.exs +7 -0
  59. package/crawler/test/lib/crawler/dispatcher_test.exs +5 -0
  60. package/crawler/test/lib/crawler/fetcher/header_preparer_test.exs +7 -0
  61. package/crawler/test/lib/crawler/fetcher/policer_test.exs +71 -0
  62. package/crawler/test/lib/crawler/fetcher/recorder_test.exs +9 -0
  63. package/crawler/test/lib/crawler/fetcher/requester_test.exs +9 -0
  64. package/crawler/test/lib/crawler/fetcher/retrier_test.exs +7 -0
  65. package/crawler/test/lib/crawler/fetcher/url_filter_test.exs +7 -0
  66. package/crawler/test/lib/crawler/fetcher_test.exs +153 -0
  67. package/crawler/test/lib/crawler/http_test.exs +47 -0
  68. package/crawler/test/lib/crawler/linker/path_builder_test.exs +7 -0
  69. package/crawler/test/lib/crawler/linker/path_expander_test.exs +7 -0
  70. package/crawler/test/lib/crawler/linker/path_finder_test.exs +7 -0
  71. package/crawler/test/lib/crawler/linker/path_offliner_test.exs +7 -0
  72. package/crawler/test/lib/crawler/linker/path_prefixer_test.exs +7 -0
  73. package/crawler/test/lib/crawler/linker_test.exs +7 -0
  74. package/crawler/test/lib/crawler/options_test.exs +7 -0
  75. package/crawler/test/lib/crawler/parser/css_parser_test.exs +7 -0
  76. package/crawler/test/lib/crawler/parser/guarder_test.exs +7 -0
  77. package/crawler/test/lib/crawler/parser/html_parser_test.exs +7 -0
  78. package/crawler/test/lib/crawler/parser/link_parser/link_expander_test.exs +7 -0
  79. package/crawler/test/lib/crawler/parser/link_parser_test.exs +7 -0
  80. package/crawler/test/lib/crawler/parser_test.exs +8 -0
  81. package/crawler/test/lib/crawler/queue_handler_test.exs +7 -0
  82. package/crawler/test/lib/crawler/scraper_test.exs +7 -0
  83. package/crawler/test/lib/crawler/snapper/dir_maker_test.exs +7 -0
  84. package/crawler/test/lib/crawler/snapper/link_replacer_test.exs +7 -0
  85. package/crawler/test/lib/crawler/snapper_test.exs +9 -0
  86. package/crawler/test/lib/crawler/worker_test.exs +5 -0
  87. package/crawler/test/lib/crawler_test.exs +295 -0
  88. package/crawler/test/support/test_case.ex +24 -0
  89. package/crawler/test/support/test_helpers.ex +28 -0
  90. package/crawler/test/test_helper.exs +7 -0
  91. package/package.json +1 -1
  92. package/rubyretriever/.rspec +2 -0
  93. package/rubyretriever/.travis.yml +7 -0
  94. package/rubyretriever/Gemfile +3 -0
  95. package/rubyretriever/Gemfile.lock +64 -0
  96. package/rubyretriever/LICENSE +20 -0
  97. package/rubyretriever/Rakefile +7 -0
  98. package/rubyretriever/bin/rr +79 -0
  99. package/rubyretriever/lib/retriever/cli.rb +25 -0
  100. package/rubyretriever/lib/retriever/core_ext.rb +13 -0
  101. package/rubyretriever/lib/retriever/fetch.rb +268 -0
  102. package/rubyretriever/lib/retriever/fetchfiles.rb +71 -0
  103. package/rubyretriever/lib/retriever/fetchseo.rb +18 -0
  104. package/rubyretriever/lib/retriever/fetchsitemap.rb +43 -0
  105. package/rubyretriever/lib/retriever/link.rb +47 -0
  106. package/rubyretriever/lib/retriever/openuri_redirect_patch.rb +8 -0
  107. package/rubyretriever/lib/retriever/page.rb +104 -0
  108. package/rubyretriever/lib/retriever/page_iterator.rb +21 -0
  109. package/rubyretriever/lib/retriever/target.rb +47 -0
  110. package/rubyretriever/lib/retriever/version.rb +4 -0
  111. package/rubyretriever/lib/retriever.rb +15 -0
  112. package/rubyretriever/readme.md +166 -0
  113. package/rubyretriever/rubyretriever.gemspec +41 -0
  114. package/rubyretriever/spec/link_spec.rb +77 -0
  115. package/rubyretriever/spec/page_spec.rb +94 -0
  116. package/rubyretriever/spec/retriever_spec.rb +84 -0
  117. package/rubyretriever/spec/spec_helper.rb +17 -0
  118. package/rubyretriever/spec/target_spec.rb +55 -0
@@ -0,0 +1,82 @@
+ defmodule Crawler.Snapper do
+   @moduledoc """
+   Stores crawled pages offline.
+   """
+
+   require Logger
+
+   alias Crawler.Snapper.DirMaker
+   alias Crawler.Snapper.LinkReplacer
+
+   @doc """
+   To store pages offline, it provides the following functionality:
+
+   - replaces all URLs with their equivalent relative paths
+   - creates directories when necessary to store the files
+
+   ## Examples
+
+       iex> Snapper.snap("hello", %{save_to: tmp("snapper"), url: "http://hello-world.local"})
+       iex> File.read(tmp("snapper/hello-world.local", "index.html"))
+       {:ok, "hello"}
+
+       iex> Snapper.snap("hello", %{save_to: tmp("snapper"), url: "http://snapper.local/index.html"})
+       iex> File.read(tmp("snapper/snapper.local", "index.html"))
+       {:ok, "hello"}
+
+       iex> Snapper.snap("hello", %{save_to: "nope", url: "http://snapper.local/index.html"})
+       {:error, "Cannot write to file nope/snapper.local/index.html, reason: enoent"}
+
+       iex> Snapper.snap("hello", %{save_to: tmp("snapper"), url: "http://snapper.local/hello"})
+       iex> File.read(tmp("snapper/snapper.local/hello", "index.html"))
+       {:ok, "hello"}
+
+       iex> Snapper.snap("hello", %{save_to: tmp("snapper"), url: "http://snapper.local/hello1/"})
+       iex> File.read(tmp("snapper/snapper.local/hello1", "index.html"))
+       {:ok, "hello"}
+
+       iex> Snapper.snap(
+       iex>   "<a href='http://another.domain/page'></a>",
+       iex>   %{
+       iex>     save_to: tmp("snapper"),
+       iex>     url: "http://snapper.local/depth0",
+       iex>     depth: 1,
+       iex>     max_depths: 2,
+       iex>     html_tag: "a",
+       iex>     content_type: "text/html",
+       iex>   }
+       iex> )
+       iex> File.read(tmp("snapper/snapper.local/depth0", "index.html"))
+       {:ok, "<a href='../../another.domain/page/index.html'></a>"}
+
+       iex> Snapper.snap(
+       iex>   "<a href='https://another.domain:8888/page'></a>",
+       iex>   %{
+       iex>     save_to: tmp("snapper"),
+       iex>     url: "http://snapper.local:7777/dir/depth1",
+       iex>     depth: 1,
+       iex>     max_depths: 2,
+       iex>     html_tag: "a",
+       iex>     content_type: "text/html",
+       iex>   }
+       iex> )
+       iex> File.read(tmp("snapper/snapper.local-7777/dir/depth1", "index.html"))
+       {:ok, "<a href='../../../another.domain-8888/page/index.html'></a>"}
+   """
+   def snap(body, opts) do
+     {:ok, body} = LinkReplacer.replace_links(body, opts)
+     file_path = DirMaker.make_dir(opts)
+
+     case File.write(file_path, body) do
+       :ok ->
+         {:ok, opts}
+
+       {:error, reason} ->
+         msg = "Cannot write to file #{file_path}, reason: #{reason}"
+
+         Logger.error(msg)
+
+         {:error, msg}
+     end
+   end
+ end
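
For orientation, here is a minimal sketch of how calling code might handle `snap/2`'s two return shapes, mirroring the doctests above; the `save_to` path and URL are illustrative:

    require Logger

    case Crawler.Snapper.snap("hello", %{save_to: "/tmp/snapper", url: "http://example.local/page"}) do
      {:ok, _opts} -> :ok
      {:error, msg} -> Logger.error("Snapshot failed: #{msg}")
    end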
@@ -0,0 +1,19 @@
+ defmodule Crawler.Store.Counter do
+   use Agent
+
+   def start_link(_args) do
+     Agent.start_link(fn -> 0 end, name: __MODULE__)
+   end
+
+   def value do
+     Agent.get(__MODULE__, & &1)
+   end
+
+   def inc do
+     Agent.update(__MODULE__, &(&1 + 1))
+   end
+
+   def reset do
+     Agent.update(__MODULE__, fn _ -> 0 end)
+   end
+ end
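
Since the counter is an `Agent` registered under its module name, it can be exercised directly; a minimal `iex` sketch, assuming the process is not already running under the `Crawler.Store` supervision tree:

    iex> Crawler.Store.Counter.start_link([])
    iex> Crawler.Store.Counter.inc()
    :ok
    iex> Crawler.Store.Counter.value()
    1
    iex> Crawler.Store.Counter.reset()
    :ok
    iex> Crawler.Store.Counter.value()
    0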
@@ -0,0 +1,7 @@
+ defmodule Crawler.Store.Page do
+   @moduledoc """
+   An internal struct for keeping the URL and content of a crawled page.
+   """
+
+   defstruct [:url, :body, :opts, :processed]
+ end
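
All four fields default to `nil`, so constructing a page record only requires the values at hand (the URL below is illustrative):

    iex> page = %Crawler.Store.Page{url: "http://example.local/"}
    iex> page.processed
    nil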
@@ -0,0 +1,87 @@
+ defmodule Crawler.Store do
+   @moduledoc """
+   An internal data store for information related to each crawl.
+   """
+
+   alias Crawler.Store.Counter
+   alias Crawler.Store.DB
+   alias Crawler.Store.Page
+
+   use GenServer
+
+   def start_link(opts) do
+     children = [
+       {Registry, keys: :unique, name: DB},
+       Counter
+     ]
+
+     Supervisor.start_link(
+       children,
+       [strategy: :one_for_one, name: __MODULE__] ++ opts
+     )
+   end
+
+   @doc """
+   Initialises a new `Registry` named `Crawler.Store.DB`.
+   """
+   def init(args) do
+     {:ok, args}
+   end
+
+   @doc """
+   Finds a stored URL and returns its page data.
+   """
+   def find({url, scope}) do
+     case Registry.lookup(DB, {url, scope}) do
+       [{_, page}] -> page
+       _ -> nil
+     end
+   end
+
+   @doc """
+   Finds a stored URL and returns its page data only if it's processed.
+   """
+   def find_processed({url, scope}) do
+     case Registry.match(DB, {url, scope}, %{processed: true}) do
+       [{_, page}] -> page
+       _ -> nil
+     end
+   end
+
+   @doc """
+   Adds a URL to the registry.
+   """
+   def add({url, scope}) do
+     Registry.register(DB, {url, scope}, %Page{url: url})
+   end
+
+   @doc """
+   Adds the page data for a URL to the registry.
+   """
+   def add_page_data({url, scope}, body, opts) do
+     {_new, _old} = Registry.update_value(DB, {url, scope}, &%{&1 | body: body, opts: opts})
+   end
+
+   @doc """
+   Marks a URL as processed in the registry.
+   """
+   def processed({url, scope}) do
+     {_new, _old} = Registry.update_value(DB, {url, scope}, &%{&1 | processed: true})
+   end
+
+   def all_urls do
+     Registry.select(DB, [{{:"$1", :_, :_}, [], [:"$1"]}])
+   end
+
+   def ops_inc do
+     Counter.inc()
+   end
+
+   def ops_count do
+     Counter.value()
+   end
+
+   def ops_reset do
+     Counter.reset()
+   end
+ end
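
A sketch of the store's lifecycle for a single URL, written with pattern matches rather than full struct printouts; the `:default` scope value is illustrative:

    Crawler.Store.add({"http://example.local/", :default})

    # add/1 registers a fresh %Page{} under the {url, scope} key
    %Crawler.Store.Page{url: "http://example.local/"} =
      Crawler.Store.find({"http://example.local/", :default})

    # processed/1 flips the :processed flag, which find_processed/1 matches on
    Crawler.Store.processed({"http://example.local/", :default})

    %Crawler.Store.Page{processed: true} =
      Crawler.Store.find_processed({"http://example.local/", :default})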
@@ -0,0 +1,62 @@
+ defmodule Crawler.Worker do
+   @moduledoc """
+   Handles the crawl tasks.
+   """
+
+   require Logger
+
+   alias Crawler.Fetcher
+   alias Crawler.Store
+   alias Crawler.Store.Page
+
+   use GenServer
+
+   def init(args) do
+     :timer.send_after(args[:timeout], :stop)
+
+     {:ok, args}
+   end
+
+   @doc """
+   Runs the worker that casts data to itself to kick off the crawl workflow.
+   """
+   def run(opts) do
+     Logger.debug("Running worker with opts: #{inspect(opts)}")
+
+     {:ok, pid} = GenServer.start_link(__MODULE__, opts, hibernate_after: 0)
+
+     GenServer.cast(pid, opts)
+   end
+
+   @doc """
+   A crawl workflow that delegates responsibilities to:
+
+   - `Crawler.Fetcher.fetch/1`
+   - `Crawler.Parser.parse/1` (or a custom parser)
+   """
+   def handle_cast(_req, state) do
+     Logger.debug("Running worker with opts: #{inspect(state)}")
+
+     state
+     |> Fetcher.fetch()
+     |> state[:parser].parse()
+     |> mark_processed()
+
+     {:noreply, state, :hibernate}
+   end
+
+   def handle_info(:stop, state) do
+     {:stop, :normal, state}
+   end
+
+   def handle_info(_msg, state) do
+     {:noreply, state}
+   end
+
+   defp mark_processed({:ok, %Page{url: url, opts: opts}}) do
+     Store.ops_inc()
+     Store.processed({url, opts[:scope]})
+   end
+
+   defp mark_processed(_), do: nil
+ end
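
A hedged sketch of invoking the worker directly; in normal operation `Crawler.Dispatcher.Worker` supplies a fully assembled options map, so only the keys visible in this module are shown here (real crawls also need everything `Crawler.Fetcher.fetch/1` expects):

    Crawler.Worker.run(%{
      # init/1 schedules a :stop message after this many milliseconds
      timeout: 5_000,
      # handle_cast/2 calls state[:parser].parse/1; Crawler.Parser is the default named above
      parser: Crawler.Parser,
      url: "http://example.local/"
    })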
@@ -0,0 +1,91 @@
+ defmodule Crawler do
+   @moduledoc """
+   A high performance web crawler in Elixir.
+   """
+
+   alias Crawler.Options
+   alias Crawler.QueueHandler
+   alias Crawler.Store
+   alias Crawler.Worker
+
+   use Application
+
+   @doc """
+   Crawler is an application that gets started automatically with:
+
+   - a `Crawler.Store` that initiates a `Registry` for keeping internal data
+   """
+   def start(_type, _args) do
+     children = [
+       Store,
+       {DynamicSupervisor, name: Crawler.QueueSupervisor, strategy: :one_for_one}
+     ]
+
+     Supervisor.start_link(children, strategy: :one_for_one, name: Crawler)
+   end
+
+   @doc """
+   Enqueues a crawl, via `Crawler.QueueHandler.enqueue/1`.
+
+   This is the default crawl behaviour, as the queue determines when an actual
+   crawl should happen based on the available workers and the rate limit. The
+   queue kicks off `Crawler.Dispatcher.Worker` which in turn calls
+   `Crawler.crawl_now/1`.
+   """
+   def crawl(url, opts \\ []) do
+     opts =
+       opts
+       |> Enum.into(%{})
+       |> Options.assign_defaults()
+       |> Options.assign_scope()
+       |> Options.assign_url(url)
+       |> Options.perform_default_actions()
+
+     if Store.ops_count() < opts[:max_pages] do
+       QueueHandler.enqueue(opts)
+     end
+   end
+
+   @doc """
+   Stops the crawler.
+   """
+   def stop(opts) do
+     Process.flag(:trap_exit, true)
+     OPQ.stop(opts[:queue])
+   end
+
+   @doc """
+   Pauses the crawler.
+   """
+   def pause(opts), do: OPQ.pause(opts[:queue])
+
+   @doc """
+   Resumes the crawler after it was paused.
+   """
+   def resume(opts), do: OPQ.resume(opts[:queue])
+
+   @doc """
+   Checks whether the crawler is still crawling.
+   """
+   def running?(opts) do
+     Process.sleep(10)
+
+     cond do
+       opts[:queue] |> OPQ.info() |> elem(0) == :paused -> false
+       Store.ops_count() <= 1 -> true
+       opts[:queue] |> OPQ.queue() |> Enum.any?() -> true
+       true -> false
+     end
+   end
+
+   @doc """
+   Crawls immediately; this is used by `Crawler.Dispatcher.Worker.start_link/1`.
+
+   For general purpose use cases, always use `Crawler.crawl/2` instead.
+   """
+   def crawl_now(opts) do
+     if Store.ops_count() < opts[:max_pages] do
+       Worker.run(opts)
+     end
+   end
+ end
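
Pulling the public API together, a usage sketch following the pattern in the package README; the option values are illustrative, and `crawl/2` hands back the enqueued opts (including the `:queue`) that the control functions below expect:

    {:ok, opts} = Crawler.crawl("http://example.local/", save_to: "/tmp/crawl", max_depths: 3)

    Crawler.pause(opts)     # pause the underlying OPQ queue
    Crawler.resume(opts)    # continue where it left off
    Crawler.running?(opts)  # poll while the queue still has work
    Crawler.stop(opts)      # shut the queue down for good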
@@ -0,0 +1,78 @@
+ defmodule Crawler.Mixfile do
+   use Mix.Project
+
+   @source_url "https://github.com/fredwu/crawler"
+   @version "1.5.0"
+
+   def project do
+     [
+       app: :crawler,
+       version: @version,
+       elixir: "~> 1.13",
+       elixirc_paths: elixirc_paths(Mix.env()),
+       package: package(),
+       name: "Crawler",
+       description: "A high performance web crawler in Elixir.",
+       start_permanent: Mix.env() == :prod,
+       deps: deps(),
+       docs: docs(),
+       test_coverage: [tool: ExCoveralls],
+       preferred_cli_env: [coveralls: :test],
+       aliases: [publish: ["hex.publish", &git_tag/1]],
+       dialyzer: [
+         plt_add_apps: [:crawler],
+         flags: [:error_handling, :race_conditions, :underspecs]
+       ]
+     ]
+   end
+
+   def application do
+     [
+       mod: {Crawler, []},
+       extra_applications: [:logger, :runtime_tools, :observer, :wx]
+     ]
+   end
+
+   defp elixirc_paths(:test), do: ["lib", "test/support"]
+   defp elixirc_paths(:dev), do: ["lib", "examples"]
+   defp elixirc_paths(_), do: ["lib"]
+
+   defp deps do
+     [
+       {:httpoison, "~> 2.1"},
+       {:floki, "~> 0.30"},
+       {:opq, "~> 4.0"},
+       {:retry, "~> 0.10"},
+       {:recode, "~> 0.6", only: :dev},
+       {:ex_doc, ">= 0.0.0", only: :dev},
+       {:dialyxir, "~> 1.1", only: [:dev, :test], runtime: false},
+       {:plug_cowboy, "~> 2.0", only: :test},
+       {:bypass, "~> 2.1", only: :test},
+       {:excoveralls, "~> 0.7", only: :test}
+     ]
+   end
+
+   defp package do
+     [
+       maintainers: ["Fred Wu"],
+       licenses: ["MIT"],
+       links: %{"GitHub" => @source_url}
+     ]
+   end
+
+   defp git_tag(_args) do
+     System.cmd("git", ["tag", "v" <> Mix.Project.config()[:version]])
+     System.cmd("git", ["push"])
+     System.cmd("git", ["push", "--tags"])
+   end
+
+   defp docs do
+     [
+       extras: ["CHANGELOG.md": [title: "Changelog"], "README.md": [title: "Overview"]],
+       main: "readme",
+       source_url: @source_url,
+       source_ref: "v#{@version}",
+       formatters: ["html"]
+     ]
+   end
+ end
@@ -0,0 +1,40 @@
+ %{
+   "bunt": {:hex, :bunt, "0.2.1", "e2d4792f7bc0ced7583ab54922808919518d0e57ee162901a16a1b6664ef3b14", [:mix], [], "hexpm", "a330bfb4245239787b15005e66ae6845c9cd524a288f0d141c148b02603777a5"},
+   "bypass": {:hex, :bypass, "2.1.0", "909782781bf8e20ee86a9cabde36b259d44af8b9f38756173e8f5e2e1fabb9b1", [:mix], [{:plug, "~> 1.7", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 2.0", [hex: :plug_cowboy, repo: "hexpm", optional: false]}, {:ranch, "~> 1.3", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm", "d9b5df8fa5b7a6efa08384e9bbecfe4ce61c77d28a4282f79e02f1ef78d96b80"},
+   "certifi": {:hex, :certifi, "2.12.0", "2d1cca2ec95f59643862af91f001478c9863c2ac9cb6e2f89780bfd8de987329", [:rebar3], [], "hexpm", "ee68d85df22e554040cdb4be100f33873ac6051387baf6a8f6ce82272340ff1c"},
+   "cowboy": {:hex, :cowboy, "2.10.0", "ff9ffeff91dae4ae270dd975642997afe2a1179d94b1887863e43f681a203e26", [:make, :rebar3], [{:cowlib, "2.12.1", [hex: :cowlib, repo: "hexpm", optional: false]}, {:ranch, "1.8.0", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm", "3afdccb7183cc6f143cb14d3cf51fa00e53db9ec80cdcd525482f5e99bc41d6b"},
+   "cowboy_telemetry": {:hex, :cowboy_telemetry, "0.4.0", "f239f68b588efa7707abce16a84d0d2acf3a0f50571f8bb7f56a15865aae820c", [:rebar3], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:telemetry, "~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "7d98bac1ee4565d31b62d59f8823dfd8356a169e7fcbb83831b8a5397404c9de"},
+   "cowlib": {:hex, :cowlib, "2.12.1", "a9fa9a625f1d2025fe6b462cb865881329b5caff8f1854d1cbc9f9533f00e1e1", [:make, :rebar3], [], "hexpm", "163b73f6367a7341b33c794c4e88e7dbfe6498ac42dcd69ef44c5bc5507c8db0"},
+   "dialyxir": {:hex, :dialyxir, "1.4.1", "a22ed1e7bd3a3e3f197b68d806ef66acb61ee8f57b3ac85fc5d57354c5482a93", [:mix], [{:erlex, ">= 0.2.6", [hex: :erlex, repo: "hexpm", optional: false]}], "hexpm", "84b795d6d7796297cca5a3118444b80c7d94f7ce247d49886e7c291e1ae49801"},
+   "earmark_parser": {:hex, :earmark_parser, "1.4.36", "487ea8ef9bdc659f085e6e654f3c3feea1d36ac3943edf9d2ef6c98de9174c13", [:mix], [], "hexpm", "a524e395634bdcf60a616efe77fd79561bec2e930d8b82745df06ab4e844400a"},
+   "erlex": {:hex, :erlex, "0.2.6", "c7987d15e899c7a2f34f5420d2a2ea0d659682c06ac607572df55a43753aa12e", [:mix], [], "hexpm", "2ed2e25711feb44d52b17d2780eabf998452f6efda104877a3881c2f8c0c0c75"},
+   "ex_doc": {:hex, :ex_doc, "0.30.6", "5f8b54854b240a2b55c9734c4b1d0dd7bdd41f71a095d42a70445c03cf05a281", [:mix], [{:earmark_parser, "~> 1.4.31", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "bd48f2ddacf4e482c727f9293d9498e0881597eae6ddc3d9562bd7923375109f"},
+   "excoveralls": {:hex, :excoveralls, "0.17.1", "83fa7906ef23aa7fc8ad7ee469c357a63b1b3d55dd701ff5b9ce1f72442b2874", [:mix], [{:castore, "~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "95bc6fda953e84c60f14da4a198880336205464e75383ec0f570180567985ae0"},
+   "floki": {:hex, :floki, "0.34.3", "5e2dcaec5d7c228ce5b1d3501502e308b2d79eb655e4191751a1fe491c37feac", [:mix], [], "hexpm", "9577440eea5b97924b4bf3c7ea55f7b8b6dce589f9b28b096cc294a8dc342341"},
+   "gen_stage": {:hex, :gen_stage, "1.2.1", "19d8b5e9a5996d813b8245338a28246307fd8b9c99d1237de199d21efc4c76a1", [:mix], [], "hexpm", "83e8be657fa05b992ffa6ac1e3af6d57aa50aace8f691fcf696ff02f8335b001"},
+   "glob_ex": {:hex, :glob_ex, "0.1.4", "fc69cb3f6df9138a1e36e9aa041ef2eab0d4dfe916331425f6bac290d1977e79", [:mix], [], "hexpm", "583d35559dc5b17f14612f7153aaaf6dcc13edf2e383126e2dfb5f2d19c78b89"},
+   "hackney": {:hex, :hackney, "1.19.1", "59de4716e985dd2b5cbd4954fa1ae187e2b610a9c4520ffcb0b1653c3d6e5559", [:rebar3], [{:certifi, "~> 2.12.0", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~> 6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~> 1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~> 1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.4.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~> 1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~> 0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "8aa08234bdefc269995c63c2282cf3cd0e36febe3a6bfab11b610572fdd1cad0"},
+   "httpoison": {:hex, :httpoison, "2.1.0", "655fd9a7b0b95ee3e9a3b535cf7ac8e08ef5229bab187fa86ac4208b122d934b", [:mix], [{:hackney, "~> 1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "fc455cb4306b43827def4f57299b2d5ac8ac331cb23f517e734a4b78210a160c"},
+   "idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~> 0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"},
+   "jason": {:hex, :jason, "1.4.1", "af1504e35f629ddcdd6addb3513c3853991f694921b1b9368b0bd32beb9f1b63", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "fbb01ecdfd565b56261302f7e1fcc27c4fb8f32d56eab74db621fc154604a7a1"},
+   "makeup": {:hex, :makeup, "1.1.0", "6b67c8bc2882a6b6a445859952a602afc1a41c2e08379ca057c0f525366fc3ca", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "0a45ed501f4a8897f580eabf99a2e5234ea3e75a4373c8a52824f6e873be57a6"},
+   "makeup_elixir": {:hex, :makeup_elixir, "0.16.1", "cc9e3ca312f1cfeccc572b37a09980287e243648108384b97ff2b76e505c3555", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "e127a341ad1b209bd80f7bd1620a15693a9908ed780c3b763bccf7d200c767c6"},
+   "makeup_erlang": {:hex, :makeup_erlang, "0.1.2", "ad87296a092a46e03b7e9b0be7631ddcf64c790fa68a9ef5323b6cbb36affc72", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "f3f5a1ca93ce6e092d92b6d9c049bcda58a3b617a8d888f8e7231c85630e8108"},
+   "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"},
+   "mime": {:hex, :mime, "2.0.5", "dc34c8efd439abe6ae0343edbb8556f4d63f178594894720607772a041b04b02", [:mix], [], "hexpm", "da0d64a365c45bc9935cc5c8a7fc5e49a0e0f9932a761c55d6c52b142780a05c"},
+   "mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"},
+   "nimble_parsec": {:hex, :nimble_parsec, "1.3.1", "2c54013ecf170e249e9291ed0a62e5832f70a476c61da16f6aac6dca0189f2af", [:mix], [], "hexpm", "2682e3c0b2eb58d90c6375fc0cc30bc7be06f365bf72608804fb9cffa5e1b167"},
+   "opq": {:hex, :opq, "4.0.3", "04fd4bc42d8de8ea0175a1758d2f88fd22c2ead5342cbcb7777d899ba7c8b44f", [:mix], [{:gen_stage, "~> 1.1", [hex: :gen_stage, repo: "hexpm", optional: false]}], "hexpm", "78cb240d11ceeb72008df5613d59ebdc1dc7c1a896a35a500c59075b0931f9c8"},
+   "parse_trans": {:hex, :parse_trans, "3.4.1", "6e6aa8167cb44cc8f39441d05193be6e6f4e7c2946cb2759f015f8c56b76e5ff", [:rebar3], [], "hexpm", "620a406ce75dada827b82e453c19cf06776be266f5a67cff34e1ef2cbb60e49a"},
+   "plug": {:hex, :plug, "1.14.2", "cff7d4ec45b4ae176a227acd94a7ab536d9b37b942c8e8fa6dfc0fff98ff4d80", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:plug_crypto, "~> 1.1.1 or ~> 1.2", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.3 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "842fc50187e13cf4ac3b253d47d9474ed6c296a8732752835ce4a86acdf68d13"},
+   "plug_cowboy": {:hex, :plug_cowboy, "2.6.1", "9a3bbfceeb65eff5f39dab529e5cd79137ac36e913c02067dba3963a26efe9b2", [:mix], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:cowboy_telemetry, "~> 0.3", [hex: :cowboy_telemetry, repo: "hexpm", optional: false]}, {:plug, "~> 1.14", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm", "de36e1a21f451a18b790f37765db198075c25875c64834bcc82d90b309eb6613"},
+   "plug_crypto": {:hex, :plug_crypto, "1.2.5", "918772575e48e81e455818229bf719d4ab4181fcbf7f85b68a35620f78d89ced", [:mix], [], "hexpm", "26549a1d6345e2172eb1c233866756ae44a9609bd33ee6f99147ab3fd87fd842"},
+   "ranch": {:hex, :ranch, "1.8.0", "8c7a100a139fd57f17327b6413e4167ac559fbc04ca7448e9be9057311597a1d", [:make, :rebar3], [], "hexpm", "49fbcfd3682fab1f5d109351b61257676da1a2fdbe295904176d5e521a2ddfe5"},
+   "recode": {:hex, :recode, "0.6.4", "a3370bda63376953fb5e4698593c64388f27efd60f6b06763dbcd37e869c159e", [:mix], [{:bunt, "~> 0.2", [hex: :bunt, repo: "hexpm", optional: false]}, {:glob_ex, "~> 0.1", [hex: :glob_ex, repo: "hexpm", optional: false]}, {:rewrite, "~> 0.9", [hex: :rewrite, repo: "hexpm", optional: false]}], "hexpm", "46700acb62d1f585a25cd3c3f5b19377911911d34107c9a5879d5e0bc6544995"},
+   "retry": {:hex, :retry, "0.18.0", "dc58ebe22c95aa00bc2459f9e0c5400e6005541cf8539925af0aa027dc860543", [:mix], [], "hexpm", "9483959cc7bf69c9e576d9dfb2b678b71c045d3e6f39ab7c9aa1489df4492d73"},
+   "rewrite": {:hex, :rewrite, "0.9.0", "90f2108ee657705bea00fa30d56dc26b8113ddfe1481487b8f6687871316a131", [:mix], [{:glob_ex, "~> 0.1", [hex: :glob_ex, repo: "hexpm", optional: false]}, {:sourceror, "~> 0.13", [hex: :sourceror, repo: "hexpm", optional: false]}], "hexpm", "5ee26ba5ab0ae3c1155b2ba8093d2bbf78346b85c8493bc0bb4b49a3d6b3330f"},
+   "sourceror": {:hex, :sourceror, "0.14.0", "b6b8552d0240400d66b6f107c1bab7ac1726e998efc797f178b7b517e928e314", [:mix], [], "hexpm", "809c71270ad48092d40bbe251a133e49ae229433ce103f762a2373b7a10a8d8b"},
+   "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.7", "354c321cf377240c7b8716899e182ce4890c5938111a1296add3ec74cf1715df", [:make, :mix, :rebar3], [], "hexpm", "fe4c190e8f37401d30167c8c405eda19469f34577987c76dde613e838bbc67f8"},
+   "telemetry": {:hex, :telemetry, "1.2.1", "68fdfe8d8f05a8428483a97d7aab2f268aaff24b49e0f599faa091f1d4e7f61c", [:rebar3], [], "hexpm", "dad9ce9d8effc621708f99eac538ef1cbe05d6a874dd741de2e689c47feafed5"},
+   "unicode_util_compat": {:hex, :unicode_util_compat, "0.7.0", "bc84380c9ab48177092f43ac89e4dfa2c6d62b40b8bd132b1059ecc7232f9a78", [:rebar3], [], "hexpm", "25eee6d67df61960cf6a794239566599b09e17e668d3700247bc498638152521"},
+ }
@@ -0,0 +1,135 @@
+ defmodule IntegrationTest do
+   use Crawler.TestCase, async: false
+
+   import Plug.Conn
+
+   @moduletag capture_log: true
+
+   test "integration", %{
+     bypass: bypass,
+     url: url,
+     path: path,
+     bypass2: bypass2,
+     url2: url2,
+     path2: path2
+   } do
+     linked_url1 = "#{url}/page1.html"
+     linked_url2 = "#{url}/dir/page2.html"
+     linked_url3 = "#{url2}/page3.html"
+
+     page1_raw = "<html><a href='#{linked_url2}'>2</a> <a href='#{linked_url3}'>3</a></html>"
+     page2_raw = "<html><a href='#{linked_url3}'>3</a></html>"
+     page3_raw = "<html><a href='dir/page4'>4</a> <a href='/dir/page4'>4</a></html>"
+
+     page4_raw =
+       "<html><head><script type='text/javascript' src='/javascript.js' /><link rel='stylesheet' href='../styles.css' /></head><a href='../page5.html'>5</a> <img src='../image1.png' /></html>"
+
+     page5_raw = "<html><a href='/page6'>6</a> <img src='/image2.png' /></html>"
+     css_raw = "img { url(image3.png); }"
+
+     Bypass.expect_once(
+       bypass,
+       "GET",
+       "/page1.html",
+       &(&1 |> put_resp_header("content-type", "text/html") |> resp(200, page1_raw))
+     )
+
+     Bypass.expect_once(
+       bypass,
+       "GET",
+       "/dir/page2.html",
+       &(&1 |> put_resp_header("content-type", "text/html") |> resp(200, page2_raw))
+     )
+
+     Bypass.expect_once(
+       bypass2,
+       "GET",
+       "/page3.html",
+       &(&1 |> put_resp_header("content-type", "text/html") |> resp(200, page3_raw))
+     )
+
+     Bypass.expect_once(
+       bypass2,
+       "GET",
+       "/dir/page4",
+       &(&1 |> put_resp_header("content-type", "text/html") |> resp(200, page4_raw))
+     )
+
+     Bypass.expect_once(
+       bypass2,
+       "GET",
+       "/page5.html",
+       &(&1 |> put_resp_header("content-type", "text/html") |> resp(200, page5_raw))
+     )
+
+     Bypass.expect_once(
+       bypass2,
+       "GET",
+       "/image1.png",
+       &(&1 |> put_resp_header("content-type", "image/png") |> resp(200, "png"))
+     )
+
+     Bypass.expect_once(
+       bypass2,
+       "GET",
+       "/image2.png",
+       &(&1 |> put_resp_header("content-type", "image/png") |> resp(200, "png"))
+     )
+
+     Bypass.expect_once(
+       bypass2,
+       "GET",
+       "/image3.png",
+       &(&1 |> put_resp_header("content-type", "image/png") |> resp(200, "png"))
+     )
+
+     Bypass.expect_once(
+       bypass2,
+       "GET",
+       "/styles.css",
+       &(&1 |> put_resp_header("content-type", "text/css") |> resp(200, css_raw))
+     )
+
+     Bypass.expect_once(
+       bypass2,
+       "GET",
+       "/javascript.js",
+       &(&1 |> put_resp_header("content-type", "application/javascript") |> resp(200, "js"))
+     )
+
+     Crawler.crawl(linked_url1,
+       save_to: tmp("integration"),
+       max_depths: 4,
+       assets: ["js", "css", "images"]
+     )
+
+     page1 =
+       "<html><a href='../#{path}/dir/page2.html'>2</a> <a href='../#{path2}/page3.html'>3</a></html>"
+
+     page2 = "<html><a href='../../#{path2}/page3.html'>3</a></html>"
+
+     page3 =
+       "<html><a href='../#{path2}/dir/page4/index.html'>4</a> <a href='../#{path2}/dir/page4/index.html'>4</a></html>"
+
+     page4 =
+       "<html><head><script type='text/javascript' src='../../../#{path2}/javascript.js' /><link rel='stylesheet' href='../../../#{path2}/styles.css' /></head><a href='../../../#{path2}/page5.html'>5</a> <img src='../../../#{path2}/image1.png' /></html>"
+
+     page5 =
+       "<html><a href='../#{path2}/page6/index.html'>6</a> <img src='../#{path2}/image2.png' /></html>"
+
+     css = "img { url(../#{path2}/image3.png); }"
+
+     wait(fn ->
+       assert {:ok, page1} == File.read(tmp("integration/#{path}", "page1.html"))
+       assert {:ok, page2} == File.read(tmp("integration/#{path}/dir", "page2.html"))
+       assert {:ok, page3} == File.read(tmp("integration/#{path2}", "page3.html"))
+       assert {:ok, page4} == File.read(tmp("integration/#{path2}/dir/page4", "index.html"))
+       assert {:ok, page5} == File.read(tmp("integration/#{path2}", "page5.html"))
+       assert {:ok, "png"} == File.read(tmp("integration/#{path2}", "image1.png"))
+       assert {:ok, "png"} == File.read(tmp("integration/#{path2}", "image2.png"))
+       assert {:ok, "png"} == File.read(tmp("integration/#{path2}", "image3.png"))
+       assert {:ok, css} == File.read(tmp("integration/#{path2}", "styles.css"))
+       assert {:ok, "js"} == File.read(tmp("integration/#{path2}", "javascript.js"))
+     end)
+   end
+ end
@@ -0,0 +1,7 @@
+ defmodule Crawler.Dispatcher.WorkerTest do
+   use Crawler.TestCase, async: true
+
+   alias Crawler.Dispatcher.Worker
+
+   doctest Worker
+ end
@@ -0,0 +1,5 @@
+ defmodule Crawler.DispatcherTest do
+   use Crawler.TestCase, async: true
+
+   doctest Crawler.Dispatcher
+ end
@@ -0,0 +1,7 @@
+ defmodule Crawler.Fetcher.HeaderPreparerTest do
+   use Crawler.TestCase, async: true
+
+   alias Crawler.Fetcher.HeaderPreparer
+
+   doctest HeaderPreparer
+ end