powerdlz23 1.2.3 → 1.2.4
This diff shows the changes between publicly available package versions as released to their respective public registries. It is provided for informational purposes only.
- package/Spider/README.md +19 -0
- package/Spider/domain.py +18 -0
- package/Spider/general.py +51 -0
- package/Spider/link_finder.py +25 -0
- package/Spider/main.py +50 -0
- package/Spider/spider.py +74 -0
- package/crawler/.formatter.exs +5 -0
- package/crawler/.github/workflows/ci.yml +29 -0
- package/crawler/.recode.exs +33 -0
- package/crawler/.tool-versions +2 -0
- package/crawler/CHANGELOG.md +82 -0
- package/crawler/README.md +198 -0
- package/crawler/architecture.svg +4 -0
- package/crawler/config/config.exs +9 -0
- package/crawler/config/dev.exs +5 -0
- package/crawler/config/test.exs +5 -0
- package/crawler/examples/google_search/scraper.ex +37 -0
- package/crawler/examples/google_search/url_filter.ex +11 -0
- package/crawler/examples/google_search.ex +77 -0
- package/crawler/lib/crawler/dispatcher/worker.ex +14 -0
- package/crawler/lib/crawler/dispatcher.ex +20 -0
- package/crawler/lib/crawler/fetcher/header_preparer.ex +60 -0
- package/crawler/lib/crawler/fetcher/modifier.ex +45 -0
- package/crawler/lib/crawler/fetcher/policer.ex +77 -0
- package/crawler/lib/crawler/fetcher/recorder.ex +55 -0
- package/crawler/lib/crawler/fetcher/requester.ex +32 -0
- package/crawler/lib/crawler/fetcher/retrier.ex +43 -0
- package/crawler/lib/crawler/fetcher/url_filter.ex +26 -0
- package/crawler/lib/crawler/fetcher.ex +81 -0
- package/crawler/lib/crawler/http.ex +7 -0
- package/crawler/lib/crawler/linker/path_builder.ex +71 -0
- package/crawler/lib/crawler/linker/path_expander.ex +59 -0
- package/crawler/lib/crawler/linker/path_finder.ex +106 -0
- package/crawler/lib/crawler/linker/path_offliner.ex +59 -0
- package/crawler/lib/crawler/linker/path_prefixer.ex +46 -0
- package/crawler/lib/crawler/linker.ex +173 -0
- package/crawler/lib/crawler/options.ex +127 -0
- package/crawler/lib/crawler/parser/css_parser.ex +37 -0
- package/crawler/lib/crawler/parser/guarder.ex +38 -0
- package/crawler/lib/crawler/parser/html_parser.ex +41 -0
- package/crawler/lib/crawler/parser/link_parser/link_expander.ex +32 -0
- package/crawler/lib/crawler/parser/link_parser.ex +50 -0
- package/crawler/lib/crawler/parser.ex +122 -0
- package/crawler/lib/crawler/queue_handler.ex +45 -0
- package/crawler/lib/crawler/scraper.ex +28 -0
- package/crawler/lib/crawler/snapper/dir_maker.ex +45 -0
- package/crawler/lib/crawler/snapper/link_replacer.ex +95 -0
- package/crawler/lib/crawler/snapper.ex +82 -0
- package/crawler/lib/crawler/store/counter.ex +19 -0
- package/crawler/lib/crawler/store/page.ex +7 -0
- package/crawler/lib/crawler/store.ex +87 -0
- package/crawler/lib/crawler/worker.ex +62 -0
- package/crawler/lib/crawler.ex +91 -0
- package/crawler/mix.exs +78 -0
- package/crawler/mix.lock +40 -0
- package/crawler/test/fixtures/introducing-elixir.jpg +0 -0
- package/crawler/test/integration_test.exs +135 -0
- package/crawler/test/lib/crawler/dispatcher/worker_test.exs +7 -0
- package/crawler/test/lib/crawler/dispatcher_test.exs +5 -0
- package/crawler/test/lib/crawler/fetcher/header_preparer_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher/policer_test.exs +71 -0
- package/crawler/test/lib/crawler/fetcher/recorder_test.exs +9 -0
- package/crawler/test/lib/crawler/fetcher/requester_test.exs +9 -0
- package/crawler/test/lib/crawler/fetcher/retrier_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher/url_filter_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher_test.exs +153 -0
- package/crawler/test/lib/crawler/http_test.exs +47 -0
- package/crawler/test/lib/crawler/linker/path_builder_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_expander_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_finder_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_offliner_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_prefixer_test.exs +7 -0
- package/crawler/test/lib/crawler/linker_test.exs +7 -0
- package/crawler/test/lib/crawler/options_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/css_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/guarder_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/html_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/link_parser/link_expander_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/link_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser_test.exs +8 -0
- package/crawler/test/lib/crawler/queue_handler_test.exs +7 -0
- package/crawler/test/lib/crawler/scraper_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper/dir_maker_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper/link_replacer_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper_test.exs +9 -0
- package/crawler/test/lib/crawler/worker_test.exs +5 -0
- package/crawler/test/lib/crawler_test.exs +295 -0
- package/crawler/test/support/test_case.ex +24 -0
- package/crawler/test/support/test_helpers.ex +28 -0
- package/crawler/test/test_helper.exs +7 -0
- package/package.json +1 -1
- package/rubyretriever/.rspec +2 -0
- package/rubyretriever/.travis.yml +7 -0
- package/rubyretriever/Gemfile +3 -0
- package/rubyretriever/Gemfile.lock +64 -0
- package/rubyretriever/LICENSE +20 -0
- package/rubyretriever/Rakefile +7 -0
- package/rubyretriever/bin/rr +79 -0
- package/rubyretriever/lib/retriever/cli.rb +25 -0
- package/rubyretriever/lib/retriever/core_ext.rb +13 -0
- package/rubyretriever/lib/retriever/fetch.rb +268 -0
- package/rubyretriever/lib/retriever/fetchfiles.rb +71 -0
- package/rubyretriever/lib/retriever/fetchseo.rb +18 -0
- package/rubyretriever/lib/retriever/fetchsitemap.rb +43 -0
- package/rubyretriever/lib/retriever/link.rb +47 -0
- package/rubyretriever/lib/retriever/openuri_redirect_patch.rb +8 -0
- package/rubyretriever/lib/retriever/page.rb +104 -0
- package/rubyretriever/lib/retriever/page_iterator.rb +21 -0
- package/rubyretriever/lib/retriever/target.rb +47 -0
- package/rubyretriever/lib/retriever/version.rb +4 -0
- package/rubyretriever/lib/retriever.rb +15 -0
- package/rubyretriever/readme.md +166 -0
- package/rubyretriever/rubyretriever.gemspec +41 -0
- package/rubyretriever/spec/link_spec.rb +77 -0
- package/rubyretriever/spec/page_spec.rb +94 -0
- package/rubyretriever/spec/retriever_spec.rb +84 -0
- package/rubyretriever/spec/spec_helper.rb +17 -0
- package/rubyretriever/spec/target_spec.rb +55 -0
package/crawler/lib/crawler/snapper.ex
ADDED
@@ -0,0 +1,82 @@
+defmodule Crawler.Snapper do
+  @moduledoc """
+  Stores crawled pages offline.
+  """
+
+  require Logger
+
+  alias Crawler.Snapper.DirMaker
+  alias Crawler.Snapper.LinkReplacer
+
+  @doc """
+  In order to store pages offline, it provides the following functionalities:
+
+  - replaces all URLs to their equivalent relative paths
+  - creates directories when necessary to store the files
+
+  ## Examples
+
+      iex> Snapper.snap("hello", %{save_to: tmp("snapper"), url: "http://hello-world.local"})
+      iex> File.read(tmp("snapper/hello-world.local", "index.html"))
+      {:ok, "hello"}
+
+      iex> Snapper.snap("hello", %{save_to: tmp("snapper"), url: "http://snapper.local/index.html"})
+      iex> File.read(tmp("snapper/snapper.local", "index.html"))
+      {:ok, "hello"}
+
+      iex> Snapper.snap("hello", %{save_to: "nope", url: "http://snapper.local/index.html"})
+      {:error, "Cannot write to file nope/snapper.local/index.html, reason: enoent"}
+
+      iex> Snapper.snap("hello", %{save_to: tmp("snapper"), url: "http://snapper.local/hello"})
+      iex> File.read(tmp("snapper/snapper.local/hello", "index.html"))
+      {:ok, "hello"}
+
+      iex> Snapper.snap("hello", %{save_to: tmp("snapper"), url: "http://snapper.local/hello1/"})
+      iex> File.read(tmp("snapper/snapper.local/hello1", "index.html"))
+      {:ok, "hello"}
+
+      iex> Snapper.snap(
+      iex>   "<a href='http://another.domain/page'></a>",
+      iex>   %{
+      iex>     save_to: tmp("snapper"),
+      iex>     url: "http://snapper.local/depth0",
+      iex>     depth: 1,
+      iex>     max_depths: 2,
+      iex>     html_tag: "a",
+      iex>     content_type: "text/html",
+      iex>   }
+      iex> )
+      iex> File.read(tmp("snapper/snapper.local/depth0", "index.html"))
+      {:ok, "<a href='../../another.domain/page/index.html'></a>"}
+
+      iex> Snapper.snap(
+      iex>   "<a href='https://another.domain:8888/page'></a>",
+      iex>   %{
+      iex>     save_to: tmp("snapper"),
+      iex>     url: "http://snapper.local:7777/dir/depth1",
+      iex>     depth: 1,
+      iex>     max_depths: 2,
+      iex>     html_tag: "a",
+      iex>     content_type: "text/html",
+      iex>   }
+      iex> )
+      iex> File.read(tmp("snapper/snapper.local-7777/dir/depth1", "index.html"))
+      {:ok, "<a href='../../../another.domain-8888/page/index.html'></a>"}
+  """
+  def snap(body, opts) do
+    {:ok, body} = LinkReplacer.replace_links(body, opts)
+    file_path = DirMaker.make_dir(opts)
+
+    case File.write(file_path, body) do
+      :ok ->
+        {:ok, opts}
+
+      {:error, reason} ->
+        msg = "Cannot write to file #{file_path}, reason: #{reason}"
+
+        Logger.error(msg)
+
+        {:error, msg}
+    end
+  end
+end
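The doctests above encode the snapper's path-mapping rules: a URL's host becomes the top-level directory (suffixed with `-port` when the port is non-default), and a path with no file extension gains a trailing `index.html`. The real logic lives in the `Crawler.Linker.Path*` modules added in this same release; the sketch below is only an illustrative reconstruction of the mapping the examples demonstrate.

    # Hypothetical helper, not part of the package; it mirrors the
    # host-port and index.html conventions shown in the doctests above.
    defmodule PathSketch do
      def offline_path(url) do
        uri = URI.parse(url)
        host = if uri.port in [80, 443], do: uri.host, else: "#{uri.host}-#{uri.port}"
        path = uri.path || "/"

        if Path.extname(path) == "" do
          # directory-like paths get an index.html placed inside them
          Path.join([host, path, "index.html"])
        else
          Path.join(host, path)
        end
      end
    end

    PathSketch.offline_path("http://snapper.local:7777/dir/depth1")
    # => "snapper.local-7777/dir/depth1/index.html"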
package/crawler/lib/crawler/store/counter.ex
ADDED
@@ -0,0 +1,19 @@
+defmodule Crawler.Store.Counter do
+  use Agent
+
+  def start_link(_args) do
+    Agent.start_link(fn -> 0 end, name: __MODULE__)
+  end
+
+  def value do
+    Agent.get(__MODULE__, & &1)
+  end
+
+  def inc do
+    Agent.update(__MODULE__, &(&1 + 1))
+  end
+
+  def reset do
+    Agent.update(__MODULE__, fn _ -> 0 end)
+  end
+end
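`Crawler.Store.Counter` is a bare `Agent` holding one integer, registered under the module name so callers never handle a pid. A quick sketch of its behaviour once started (in the package it is started under the store's supervisor, shown in the next hunk):

    {:ok, _pid} = Crawler.Store.Counter.start_link([])

    Crawler.Store.Counter.inc()
    Crawler.Store.Counter.inc()
    Crawler.Store.Counter.value()
    # => 2

    Crawler.Store.Counter.reset()
    Crawler.Store.Counter.value()
    # => 0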
package/crawler/lib/crawler/store.ex
ADDED
@@ -0,0 +1,87 @@
+defmodule Crawler.Store do
+  @moduledoc """
+  An internal data store for information related to each crawl.
+  """
+
+  alias Crawler.Store.Counter
+  alias Crawler.Store.DB
+  alias Crawler.Store.Page
+
+  use GenServer
+
+  def start_link(opts) do
+    children = [
+      {Registry, keys: :unique, name: DB},
+      Counter
+    ]
+
+    Supervisor.start_link(
+      children,
+      [strategy: :one_for_one, name: __MODULE__] ++ opts
+    )
+  end
+
+  @doc """
+  Initialises a new `Registry` named `Crawler.Store.DB`.
+  """
+  def init(args) do
+    {:ok, args}
+  end
+
+  @doc """
+  Finds a stored URL and returns its page data.
+  """
+  def find({url, scope}) do
+    case Registry.lookup(DB, {url, scope}) do
+      [{_, page}] -> page
+      _ -> nil
+    end
+  end
+
+  @doc """
+  Finds a stored URL and returns its page data only if it's processed.
+  """
+  def find_processed({url, scope}) do
+    case Registry.match(DB, {url, scope}, %{processed: true}) do
+      [{_, page}] -> page
+      _ -> nil
+    end
+  end
+
+  @doc """
+  Adds a URL to the registry.
+  """
+  def add({url, scope}) do
+    Registry.register(DB, {url, scope}, %Page{url: url})
+  end
+
+  @doc """
+  Adds the page data for a URL to the registry.
+  """
+  def add_page_data({url, scope}, body, opts) do
+    {_new, _old} = Registry.update_value(DB, {url, scope}, &%{&1 | body: body, opts: opts})
+  end
+
+  @doc """
+  Marks a URL as processed in the registry.
+  """
+  def processed({url, scope}) do
+    {_new, _old} = Registry.update_value(DB, {url, scope}, &%{&1 | processed: true})
+  end
+
+  def all_urls do
+    Registry.select(DB, [{{:"$1", :_, :_}, [], [:"$1"]}])
+  end
+
+  def ops_inc do
+    Counter.inc()
+  end
+
+  def ops_count do
+    Counter.value()
+  end
+
+  def ops_reset do
+    Counter.reset()
+  end
+end
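Despite the `use GenServer`, `start_link/1` above actually boots a small supervision tree: a unique-keyed `Registry` (aliased as `DB`) plus the counter. Every page is keyed by a `{url, scope}` tuple. A hedged sketch of the lifecycle these functions imply (the key, body, and opts values are illustrative):

    key = {"http://example.local/page.html", nil}

    # register the URL, attach the fetched body, then mark it done
    Crawler.Store.add(key)
    Crawler.Store.add_page_data(key, "<html></html>", %{depth: 0})
    Crawler.Store.processed(key)

    # find_processed/1 only returns the page once its :processed flag is set
    %Crawler.Store.Page{processed: true} = Crawler.Store.find_processed(key)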
package/crawler/lib/crawler/worker.ex
ADDED
@@ -0,0 +1,62 @@
+defmodule Crawler.Worker do
+  @moduledoc """
+  Handles the crawl tasks.
+  """
+
+  require Logger
+
+  alias Crawler.Fetcher
+  alias Crawler.Store
+  alias Crawler.Store.Page
+
+  use GenServer
+
+  def init(args) do
+    :timer.send_after(args[:timeout], :stop)
+
+    {:ok, args}
+  end
+
+  @doc """
+  Runs the worker that casts data to itself to kick off the crawl workflow.
+  """
+  def run(opts) do
+    Logger.debug("Running worker with opts: #{inspect(opts)}")
+
+    {:ok, pid} = GenServer.start_link(__MODULE__, opts, hibernate_after: 0)
+
+    GenServer.cast(pid, opts)
+  end
+
+  @doc """
+  A crawl workflow that delegates responsibilities to:
+
+  - `Crawler.Fetcher.fetch/1`
+  - `Crawler.Parser.parse/1` (or a custom parser)
+  """
+  def handle_cast(_req, state) do
+    Logger.debug("Running worker with opts: #{inspect(state)}")
+
+    state
+    |> Fetcher.fetch()
+    |> state[:parser].parse()
+    |> mark_processed()
+
+    {:noreply, state, :hibernate}
+  end
+
+  def handle_info(:stop, state) do
+    {:stop, :normal, state}
+  end
+
+  def handle_info(_msg, state) do
+    {:noreply, state}
+  end
+
+  defp mark_processed({:ok, %Page{url: url, opts: opts}}) do
+    Store.ops_inc()
+    Store.processed({url, opts[:scope]})
+  end
+
+  defp mark_processed(_), do: nil
+end
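The worker is fire-and-forget: `run/1` starts a GenServer and immediately casts back to it, `handle_cast/2` runs the fetch, parse, and mark-processed pipeline before hibernating, and the `:timer.send_after/2` armed in `init/1` stops the process after `opts[:timeout]` regardless. A minimal standalone sketch of the same one-shot pattern (illustrative, not part of the diff):

    defmodule OneShotWorker do
      use GenServer

      def run(work, timeout \\ 5_000) do
        {:ok, pid} = GenServer.start_link(__MODULE__, timeout)
        GenServer.cast(pid, {:go, work})
      end

      def init(timeout) do
        # schedule our own shutdown up front
        :timer.send_after(timeout, :stop)
        {:ok, timeout}
      end

      def handle_cast({:go, work}, state) do
        work.()
        # hibernate to shed memory while waiting for :stop
        {:noreply, state, :hibernate}
      end

      def handle_info(:stop, state), do: {:stop, :normal, state}
    end

    OneShotWorker.run(fn -> IO.puts("crawl one page") end)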
package/crawler/lib/crawler.ex
ADDED
@@ -0,0 +1,91 @@
+defmodule Crawler do
+  @moduledoc """
+  A high performance web crawler in Elixir.
+  """
+
+  alias Crawler.Options
+  alias Crawler.QueueHandler
+  alias Crawler.Store
+  alias Crawler.Worker
+
+  use Application
+
+  @doc """
+  Crawler is an application that gets started automatically with:
+
+  - a `Crawler.Store` that initiates a `Registry` for keeping internal data
+  """
+  def start(_type, _args) do
+    children = [
+      Store,
+      {DynamicSupervisor, name: Crawler.QueueSupervisor, strategy: :one_for_one}
+    ]
+
+    Supervisor.start_link(children, strategy: :one_for_one, name: Crawler)
+  end
+
+  @doc """
+  Enqueues a crawl, via `Crawler.QueueHandler.enqueue/1`.
+
+  This is the default crawl behaviour as the queue determines when an actual
+  crawl should happen based on the available workers and the rate limit. The
+  queue kicks off `Crawler.Dispatcher.Worker` which in turn calls
+  `Crawler.crawl_now/1`.
+  """
+  def crawl(url, opts \\ []) do
+    opts =
+      opts
+      |> Enum.into(%{})
+      |> Options.assign_defaults()
+      |> Options.assign_scope()
+      |> Options.assign_url(url)
+      |> Options.perform_default_actions()
+
+    if Store.ops_count() < opts[:max_pages] do
+      QueueHandler.enqueue(opts)
+    end
+  end
+
+  @doc """
+  Stops the crawler.
+  """
+  def stop(opts) do
+    Process.flag(:trap_exit, true)
+    OPQ.stop(opts[:queue])
+  end
+
+  @doc """
+  Pauses the crawler.
+  """
+  def pause(opts), do: OPQ.pause(opts[:queue])
+
+  @doc """
+  Resumes the crawler after it was paused.
+  """
+  def resume(opts), do: OPQ.resume(opts[:queue])
+
+  @doc """
+  Checks whether the crawler is still crawling.
+  """
+  def running?(opts) do
+    Process.sleep(10)
+
+    cond do
+      opts[:queue] |> OPQ.info() |> elem(0) == :paused -> false
+      Store.ops_count() <= 1 -> true
+      OPQ.queue(opts[:queue]) |> Enum.any?() -> true
+      true -> false
+    end
+  end
+
+  @doc """
+  Crawls immediately, this is used by `Crawler.Dispatcher.Worker.start_link/1`.
+
+  For general purpose use cases, always use `Crawler.crawl/2` instead.
+  """
+  def crawl_now(opts) do
+    if Store.ops_count() < opts[:max_pages] do
+      Worker.run(opts)
+    end
+  end
+end
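Putting the public API together: `crawl/2` normalises the keyword options into a map and enqueues the crawl, and workers drain the queue until `max_pages` is hit. A usage sketch (the URL and option values are placeholders; the option names come from the code and the integration test in this diff):

    # enqueue a crawl that snapshots pages and assets to disk
    Crawler.crawl("http://example.local/index.html",
      save_to: "/tmp/snapshots",
      max_depths: 3,
      assets: ["js", "css", "images"]
    )

    # pause/resume/stop and running? all take the opts map, which carries
    # the :queue handle that the crawl threads through:
    #
    #   Crawler.pause(opts)
    #   Crawler.resume(opts)
    #   Crawler.stop(opts)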
package/crawler/mix.exs
ADDED
@@ -0,0 +1,78 @@
+defmodule Crawler.Mixfile do
+  use Mix.Project
+
+  @source_url "https://github.com/fredwu/crawler"
+  @version "1.5.0"
+
+  def project do
+    [
+      app: :crawler,
+      version: @version,
+      elixir: "~> 1.13",
+      elixirc_paths: elixirc_paths(Mix.env()),
+      package: package(),
+      name: "Crawler",
+      description: "A high performance web crawler in Elixir.",
+      start_permanent: Mix.env() == :prod,
+      deps: deps(),
+      docs: docs(),
+      test_coverage: [tool: ExCoveralls],
+      preferred_cli_env: [coveralls: :test],
+      aliases: [publish: ["hex.publish", &git_tag/1]],
+      dialyzer: [
+        plt_add_apps: [:crawler],
+        flags: [:error_handling, :race_conditions, :underspecs]
+      ]
+    ]
+  end
+
+  def application do
+    [
+      mod: {Crawler, []},
+      extra_applications: [:logger, :runtime_tools, :observer, :wx]
+    ]
+  end
+
+  defp elixirc_paths(:test), do: ["lib", "test/support"]
+  defp elixirc_paths(:dev), do: ["lib", "examples"]
+  defp elixirc_paths(_), do: ["lib"]
+
+  defp deps do
+    [
+      {:httpoison, "~> 2.1"},
+      {:floki, "~> 0.30"},
+      {:opq, "~> 4.0"},
+      {:retry, "~> 0.10"},
+      {:recode, "~> 0.6", only: :dev},
+      {:ex_doc, ">= 0.0.0", only: :dev},
+      {:dialyxir, "~> 1.1", only: [:dev, :test], runtime: false},
+      {:plug_cowboy, "~> 2.0", only: :test},
+      {:bypass, "~> 2.1", only: :test},
+      {:excoveralls, "~> 0.7", only: :test}
+    ]
+  end
+
+  defp package do
+    [
+      maintainers: ["Fred Wu"],
+      licenses: ["MIT"],
+      links: %{"GitHub" => @source_url}
+    ]
+  end
+
+  defp git_tag(_args) do
+    System.cmd("git", ["tag", "v" <> Mix.Project.config()[:version]])
+    System.cmd("git", ["push"])
+    System.cmd("git", ["push", "--tags"])
+  end
+
+  defp docs do
+    [
+      extras: ["CHANGELOG.md": [title: "Changelog"], "README.md": [title: "Overview"]],
+      main: "readme",
+      source_url: @source_url,
+      source_ref: "v#{@version}",
+      formatters: ["html"]
+    ]
+  end
+end
package/crawler/mix.lock
ADDED
@@ -0,0 +1,40 @@
+%{
+  "bunt": {:hex, :bunt, "0.2.1", "e2d4792f7bc0ced7583ab54922808919518d0e57ee162901a16a1b6664ef3b14", [:mix], [], "hexpm", "a330bfb4245239787b15005e66ae6845c9cd524a288f0d141c148b02603777a5"},
+  "bypass": {:hex, :bypass, "2.1.0", "909782781bf8e20ee86a9cabde36b259d44af8b9f38756173e8f5e2e1fabb9b1", [:mix], [{:plug, "~> 1.7", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 2.0", [hex: :plug_cowboy, repo: "hexpm", optional: false]}, {:ranch, "~> 1.3", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm", "d9b5df8fa5b7a6efa08384e9bbecfe4ce61c77d28a4282f79e02f1ef78d96b80"},
+  "certifi": {:hex, :certifi, "2.12.0", "2d1cca2ec95f59643862af91f001478c9863c2ac9cb6e2f89780bfd8de987329", [:rebar3], [], "hexpm", "ee68d85df22e554040cdb4be100f33873ac6051387baf6a8f6ce82272340ff1c"},
+  "cowboy": {:hex, :cowboy, "2.10.0", "ff9ffeff91dae4ae270dd975642997afe2a1179d94b1887863e43f681a203e26", [:make, :rebar3], [{:cowlib, "2.12.1", [hex: :cowlib, repo: "hexpm", optional: false]}, {:ranch, "1.8.0", [hex: :ranch, repo: "hexpm", optional: false]}], "hexpm", "3afdccb7183cc6f143cb14d3cf51fa00e53db9ec80cdcd525482f5e99bc41d6b"},
+  "cowboy_telemetry": {:hex, :cowboy_telemetry, "0.4.0", "f239f68b588efa7707abce16a84d0d2acf3a0f50571f8bb7f56a15865aae820c", [:rebar3], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:telemetry, "~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "7d98bac1ee4565d31b62d59f8823dfd8356a169e7fcbb83831b8a5397404c9de"},
+  "cowlib": {:hex, :cowlib, "2.12.1", "a9fa9a625f1d2025fe6b462cb865881329b5caff8f1854d1cbc9f9533f00e1e1", [:make, :rebar3], [], "hexpm", "163b73f6367a7341b33c794c4e88e7dbfe6498ac42dcd69ef44c5bc5507c8db0"},
+  "dialyxir": {:hex, :dialyxir, "1.4.1", "a22ed1e7bd3a3e3f197b68d806ef66acb61ee8f57b3ac85fc5d57354c5482a93", [:mix], [{:erlex, ">= 0.2.6", [hex: :erlex, repo: "hexpm", optional: false]}], "hexpm", "84b795d6d7796297cca5a3118444b80c7d94f7ce247d49886e7c291e1ae49801"},
+  "earmark_parser": {:hex, :earmark_parser, "1.4.36", "487ea8ef9bdc659f085e6e654f3c3feea1d36ac3943edf9d2ef6c98de9174c13", [:mix], [], "hexpm", "a524e395634bdcf60a616efe77fd79561bec2e930d8b82745df06ab4e844400a"},
+  "erlex": {:hex, :erlex, "0.2.6", "c7987d15e899c7a2f34f5420d2a2ea0d659682c06ac607572df55a43753aa12e", [:mix], [], "hexpm", "2ed2e25711feb44d52b17d2780eabf998452f6efda104877a3881c2f8c0c0c75"},
+  "ex_doc": {:hex, :ex_doc, "0.30.6", "5f8b54854b240a2b55c9734c4b1d0dd7bdd41f71a095d42a70445c03cf05a281", [:mix], [{:earmark_parser, "~> 1.4.31", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "bd48f2ddacf4e482c727f9293d9498e0881597eae6ddc3d9562bd7923375109f"},
+  "excoveralls": {:hex, :excoveralls, "0.17.1", "83fa7906ef23aa7fc8ad7ee469c357a63b1b3d55dd701ff5b9ce1f72442b2874", [:mix], [{:castore, "~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "95bc6fda953e84c60f14da4a198880336205464e75383ec0f570180567985ae0"},
+  "floki": {:hex, :floki, "0.34.3", "5e2dcaec5d7c228ce5b1d3501502e308b2d79eb655e4191751a1fe491c37feac", [:mix], [], "hexpm", "9577440eea5b97924b4bf3c7ea55f7b8b6dce589f9b28b096cc294a8dc342341"},
+  "gen_stage": {:hex, :gen_stage, "1.2.1", "19d8b5e9a5996d813b8245338a28246307fd8b9c99d1237de199d21efc4c76a1", [:mix], [], "hexpm", "83e8be657fa05b992ffa6ac1e3af6d57aa50aace8f691fcf696ff02f8335b001"},
+  "glob_ex": {:hex, :glob_ex, "0.1.4", "fc69cb3f6df9138a1e36e9aa041ef2eab0d4dfe916331425f6bac290d1977e79", [:mix], [], "hexpm", "583d35559dc5b17f14612f7153aaaf6dcc13edf2e383126e2dfb5f2d19c78b89"},
+  "hackney": {:hex, :hackney, "1.19.1", "59de4716e985dd2b5cbd4954fa1ae187e2b610a9c4520ffcb0b1653c3d6e5559", [:rebar3], [{:certifi, "~> 2.12.0", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~> 6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~> 1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~> 1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.4.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~> 1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~> 0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "8aa08234bdefc269995c63c2282cf3cd0e36febe3a6bfab11b610572fdd1cad0"},
+  "httpoison": {:hex, :httpoison, "2.1.0", "655fd9a7b0b95ee3e9a3b535cf7ac8e08ef5229bab187fa86ac4208b122d934b", [:mix], [{:hackney, "~> 1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "fc455cb4306b43827def4f57299b2d5ac8ac331cb23f517e734a4b78210a160c"},
+  "idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~> 0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"},
+  "jason": {:hex, :jason, "1.4.1", "af1504e35f629ddcdd6addb3513c3853991f694921b1b9368b0bd32beb9f1b63", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "fbb01ecdfd565b56261302f7e1fcc27c4fb8f32d56eab74db621fc154604a7a1"},
+  "makeup": {:hex, :makeup, "1.1.0", "6b67c8bc2882a6b6a445859952a602afc1a41c2e08379ca057c0f525366fc3ca", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "0a45ed501f4a8897f580eabf99a2e5234ea3e75a4373c8a52824f6e873be57a6"},
+  "makeup_elixir": {:hex, :makeup_elixir, "0.16.1", "cc9e3ca312f1cfeccc572b37a09980287e243648108384b97ff2b76e505c3555", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "e127a341ad1b209bd80f7bd1620a15693a9908ed780c3b763bccf7d200c767c6"},
+  "makeup_erlang": {:hex, :makeup_erlang, "0.1.2", "ad87296a092a46e03b7e9b0be7631ddcf64c790fa68a9ef5323b6cbb36affc72", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "f3f5a1ca93ce6e092d92b6d9c049bcda58a3b617a8d888f8e7231c85630e8108"},
+  "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"},
+  "mime": {:hex, :mime, "2.0.5", "dc34c8efd439abe6ae0343edbb8556f4d63f178594894720607772a041b04b02", [:mix], [], "hexpm", "da0d64a365c45bc9935cc5c8a7fc5e49a0e0f9932a761c55d6c52b142780a05c"},
+  "mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"},
+  "nimble_parsec": {:hex, :nimble_parsec, "1.3.1", "2c54013ecf170e249e9291ed0a62e5832f70a476c61da16f6aac6dca0189f2af", [:mix], [], "hexpm", "2682e3c0b2eb58d90c6375fc0cc30bc7be06f365bf72608804fb9cffa5e1b167"},
+  "opq": {:hex, :opq, "4.0.3", "04fd4bc42d8de8ea0175a1758d2f88fd22c2ead5342cbcb7777d899ba7c8b44f", [:mix], [{:gen_stage, "~> 1.1", [hex: :gen_stage, repo: "hexpm", optional: false]}], "hexpm", "78cb240d11ceeb72008df5613d59ebdc1dc7c1a896a35a500c59075b0931f9c8"},
+  "parse_trans": {:hex, :parse_trans, "3.4.1", "6e6aa8167cb44cc8f39441d05193be6e6f4e7c2946cb2759f015f8c56b76e5ff", [:rebar3], [], "hexpm", "620a406ce75dada827b82e453c19cf06776be266f5a67cff34e1ef2cbb60e49a"},
+  "plug": {:hex, :plug, "1.14.2", "cff7d4ec45b4ae176a227acd94a7ab536d9b37b942c8e8fa6dfc0fff98ff4d80", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:plug_crypto, "~> 1.1.1 or ~> 1.2", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.3 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "842fc50187e13cf4ac3b253d47d9474ed6c296a8732752835ce4a86acdf68d13"},
+  "plug_cowboy": {:hex, :plug_cowboy, "2.6.1", "9a3bbfceeb65eff5f39dab529e5cd79137ac36e913c02067dba3963a26efe9b2", [:mix], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:cowboy_telemetry, "~> 0.3", [hex: :cowboy_telemetry, repo: "hexpm", optional: false]}, {:plug, "~> 1.14", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm", "de36e1a21f451a18b790f37765db198075c25875c64834bcc82d90b309eb6613"},
+  "plug_crypto": {:hex, :plug_crypto, "1.2.5", "918772575e48e81e455818229bf719d4ab4181fcbf7f85b68a35620f78d89ced", [:mix], [], "hexpm", "26549a1d6345e2172eb1c233866756ae44a9609bd33ee6f99147ab3fd87fd842"},
+  "ranch": {:hex, :ranch, "1.8.0", "8c7a100a139fd57f17327b6413e4167ac559fbc04ca7448e9be9057311597a1d", [:make, :rebar3], [], "hexpm", "49fbcfd3682fab1f5d109351b61257676da1a2fdbe295904176d5e521a2ddfe5"},
+  "recode": {:hex, :recode, "0.6.4", "a3370bda63376953fb5e4698593c64388f27efd60f6b06763dbcd37e869c159e", [:mix], [{:bunt, "~> 0.2", [hex: :bunt, repo: "hexpm", optional: false]}, {:glob_ex, "~> 0.1", [hex: :glob_ex, repo: "hexpm", optional: false]}, {:rewrite, "~> 0.9", [hex: :rewrite, repo: "hexpm", optional: false]}], "hexpm", "46700acb62d1f585a25cd3c3f5b19377911911d34107c9a5879d5e0bc6544995"},
+  "retry": {:hex, :retry, "0.18.0", "dc58ebe22c95aa00bc2459f9e0c5400e6005541cf8539925af0aa027dc860543", [:mix], [], "hexpm", "9483959cc7bf69c9e576d9dfb2b678b71c045d3e6f39ab7c9aa1489df4492d73"},
+  "rewrite": {:hex, :rewrite, "0.9.0", "90f2108ee657705bea00fa30d56dc26b8113ddfe1481487b8f6687871316a131", [:mix], [{:glob_ex, "~> 0.1", [hex: :glob_ex, repo: "hexpm", optional: false]}, {:sourceror, "~> 0.13", [hex: :sourceror, repo: "hexpm", optional: false]}], "hexpm", "5ee26ba5ab0ae3c1155b2ba8093d2bbf78346b85c8493bc0bb4b49a3d6b3330f"},
+  "sourceror": {:hex, :sourceror, "0.14.0", "b6b8552d0240400d66b6f107c1bab7ac1726e998efc797f178b7b517e928e314", [:mix], [], "hexpm", "809c71270ad48092d40bbe251a133e49ae229433ce103f762a2373b7a10a8d8b"},
+  "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.7", "354c321cf377240c7b8716899e182ce4890c5938111a1296add3ec74cf1715df", [:make, :mix, :rebar3], [], "hexpm", "fe4c190e8f37401d30167c8c405eda19469f34577987c76dde613e838bbc67f8"},
+  "telemetry": {:hex, :telemetry, "1.2.1", "68fdfe8d8f05a8428483a97d7aab2f268aaff24b49e0f599faa091f1d4e7f61c", [:rebar3], [], "hexpm", "dad9ce9d8effc621708f99eac538ef1cbe05d6a874dd741de2e689c47feafed5"},
+  "unicode_util_compat": {:hex, :unicode_util_compat, "0.7.0", "bc84380c9ab48177092f43ac89e4dfa2c6d62b40b8bd132b1059ecc7232f9a78", [:rebar3], [], "hexpm", "25eee6d67df61960cf6a794239566599b09e17e668d3700247bc498638152521"},
+}
package/crawler/test/fixtures/introducing-elixir.jpg
ADDED
Binary file
package/crawler/test/integration_test.exs
ADDED
@@ -0,0 +1,135 @@
+defmodule IntegrationTest do
+  use Crawler.TestCase, async: false
+
+  import Plug.Conn
+
+  @moduletag capture_log: true
+
+  test "integration", %{
+    bypass: bypass,
+    url: url,
+    path: path,
+    bypass2: bypass2,
+    url2: url2,
+    path2: path2
+  } do
+    linked_url1 = "#{url}/page1.html"
+    linked_url2 = "#{url}/dir/page2.html"
+    linked_url3 = "#{url2}/page3.html"
+
+    page1_raw = "<html><a href='#{linked_url2}'>2</a> <a href='#{linked_url3}'>3</a></html>"
+    page2_raw = "<html><a href='#{linked_url3}'>3</a></html>"
+    page3_raw = "<html><a href='dir/page4'>4</a> <a href='/dir/page4'>4</a></html>"
+
+    page4_raw =
+      "<html><head><script type='text/javascript' src='/javascript.js' /><link rel='stylesheet' href='../styles.css' /></head><a href='../page5.html'>5</a> <img src='../image1.png' /></html>"
+
+    page5_raw = "<html><a href='/page6'>6</a> <img src='/image2.png' /></html>"
+    css_raw = "img { url(image3.png); }"
+
+    Bypass.expect_once(
+      bypass,
+      "GET",
+      "/page1.html",
+      &(&1 |> put_resp_header("content-type", "text/html") |> resp(200, page1_raw))
+    )
+
+    Bypass.expect_once(
+      bypass,
+      "GET",
+      "/dir/page2.html",
+      &(&1 |> put_resp_header("content-type", "text/html") |> resp(200, page2_raw))
+    )
+
+    Bypass.expect_once(
+      bypass2,
+      "GET",
+      "/page3.html",
+      &(&1 |> put_resp_header("content-type", "text/html") |> resp(200, page3_raw))
+    )
+
+    Bypass.expect_once(
+      bypass2,
+      "GET",
+      "/dir/page4",
+      &(&1 |> put_resp_header("content-type", "text/html") |> resp(200, page4_raw))
+    )
+
+    Bypass.expect_once(
+      bypass2,
+      "GET",
+      "/page5.html",
+      &(&1 |> put_resp_header("content-type", "text/html") |> resp(200, page5_raw))
+    )
+
+    Bypass.expect_once(
+      bypass2,
+      "GET",
+      "/image1.png",
+      &(&1 |> put_resp_header("content-type", "image/png") |> resp(200, "png"))
+    )
+
+    Bypass.expect_once(
+      bypass2,
+      "GET",
+      "/image2.png",
+      &(&1 |> put_resp_header("content-type", "image/png") |> resp(200, "png"))
+    )
+
+    Bypass.expect_once(
+      bypass2,
+      "GET",
+      "/image3.png",
+      &(&1 |> put_resp_header("content-type", "image/png") |> resp(200, "png"))
+    )
+
+    Bypass.expect_once(
+      bypass2,
+      "GET",
+      "/styles.css",
+      &(&1 |> put_resp_header("content-type", "text/css") |> resp(200, css_raw))
+    )
+
+    Bypass.expect_once(
+      bypass2,
+      "GET",
+      "/javascript.js",
+      &(&1 |> put_resp_header("content-type", "application/javascript") |> resp(200, "js"))
+    )
+
+    Crawler.crawl(linked_url1,
+      save_to: tmp("integration"),
+      max_depths: 4,
+      assets: ["js", "css", "images"]
+    )
+
+    page1 =
+      "<html><a href='../#{path}/dir/page2.html'>2</a> <a href='../#{path2}/page3.html'>3</a></html>"
+
+    page2 = "<html><a href='../../#{path2}/page3.html'>3</a></html>"
+
+    page3 =
+      "<html><a href='../#{path2}/dir/page4/index.html'>4</a> <a href='../#{path2}/dir/page4/index.html'>4</a></html>"
+
+    page4 =
+      "<html><head><script type='text/javascript' src='../../../#{path2}/javascript.js' /><link rel='stylesheet' href='../../../#{path2}/styles.css' /></head><a href='../../../#{path2}/page5.html'>5</a> <img src='../../../#{path2}/image1.png' /></html>"
+
+    page5 =
+      "<html><a href='../#{path2}/page6/index.html'>6</a> <img src='../#{path2}/image2.png' /></html>"
+
+    css = "img { url(../#{path2}/image3.png); }"
+
+    wait(fn ->
+      assert {:ok, page1} == File.read(tmp("integration/#{path}", "page1.html"))
+      assert {:ok, page2} == File.read(tmp("integration/#{path}/dir", "page2.html"))
+      assert {:ok, page3} == File.read(tmp("integration/#{path2}", "page3.html"))
+      assert {:ok, page4} == File.read(tmp("integration/#{path2}/dir/page4", "index.html"))
+      assert {:ok, page5} == File.read(tmp("integration/#{path2}", "page5.html"))
+      assert {:ok, "png"} == File.read(tmp("integration/#{path2}", "image1.png"))
+      assert {:ok, "png"} == File.read(tmp("integration/#{path2}", "image2.png"))
+      assert {:ok, "png"} == File.read(tmp("integration/#{path2}", "image3.png"))
+      assert {:ok, css} == File.read(tmp("integration/#{path2}", "styles.css"))
+      assert {:ok, "js"} == File.read(tmp("integration/#{path2}", "javascript.js"))
+    end)
+  end
+end