powerdlz23 1.2.3 → 1.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Spider/README.md +19 -0
- package/Spider/domain.py +18 -0
- package/Spider/general.py +51 -0
- package/Spider/link_finder.py +25 -0
- package/Spider/main.py +50 -0
- package/Spider/spider.py +74 -0
- package/crawler/.formatter.exs +5 -0
- package/crawler/.github/workflows/ci.yml +29 -0
- package/crawler/.recode.exs +33 -0
- package/crawler/.tool-versions +2 -0
- package/crawler/CHANGELOG.md +82 -0
- package/crawler/README.md +198 -0
- package/crawler/architecture.svg +4 -0
- package/crawler/config/config.exs +9 -0
- package/crawler/config/dev.exs +5 -0
- package/crawler/config/test.exs +5 -0
- package/crawler/examples/google_search/scraper.ex +37 -0
- package/crawler/examples/google_search/url_filter.ex +11 -0
- package/crawler/examples/google_search.ex +77 -0
- package/crawler/lib/crawler/dispatcher/worker.ex +14 -0
- package/crawler/lib/crawler/dispatcher.ex +20 -0
- package/crawler/lib/crawler/fetcher/header_preparer.ex +60 -0
- package/crawler/lib/crawler/fetcher/modifier.ex +45 -0
- package/crawler/lib/crawler/fetcher/policer.ex +77 -0
- package/crawler/lib/crawler/fetcher/recorder.ex +55 -0
- package/crawler/lib/crawler/fetcher/requester.ex +32 -0
- package/crawler/lib/crawler/fetcher/retrier.ex +43 -0
- package/crawler/lib/crawler/fetcher/url_filter.ex +26 -0
- package/crawler/lib/crawler/fetcher.ex +81 -0
- package/crawler/lib/crawler/http.ex +7 -0
- package/crawler/lib/crawler/linker/path_builder.ex +71 -0
- package/crawler/lib/crawler/linker/path_expander.ex +59 -0
- package/crawler/lib/crawler/linker/path_finder.ex +106 -0
- package/crawler/lib/crawler/linker/path_offliner.ex +59 -0
- package/crawler/lib/crawler/linker/path_prefixer.ex +46 -0
- package/crawler/lib/crawler/linker.ex +173 -0
- package/crawler/lib/crawler/options.ex +127 -0
- package/crawler/lib/crawler/parser/css_parser.ex +37 -0
- package/crawler/lib/crawler/parser/guarder.ex +38 -0
- package/crawler/lib/crawler/parser/html_parser.ex +41 -0
- package/crawler/lib/crawler/parser/link_parser/link_expander.ex +32 -0
- package/crawler/lib/crawler/parser/link_parser.ex +50 -0
- package/crawler/lib/crawler/parser.ex +122 -0
- package/crawler/lib/crawler/queue_handler.ex +45 -0
- package/crawler/lib/crawler/scraper.ex +28 -0
- package/crawler/lib/crawler/snapper/dir_maker.ex +45 -0
- package/crawler/lib/crawler/snapper/link_replacer.ex +95 -0
- package/crawler/lib/crawler/snapper.ex +82 -0
- package/crawler/lib/crawler/store/counter.ex +19 -0
- package/crawler/lib/crawler/store/page.ex +7 -0
- package/crawler/lib/crawler/store.ex +87 -0
- package/crawler/lib/crawler/worker.ex +62 -0
- package/crawler/lib/crawler.ex +91 -0
- package/crawler/mix.exs +78 -0
- package/crawler/mix.lock +40 -0
- package/crawler/test/fixtures/introducing-elixir.jpg +0 -0
- package/crawler/test/integration_test.exs +135 -0
- package/crawler/test/lib/crawler/dispatcher/worker_test.exs +7 -0
- package/crawler/test/lib/crawler/dispatcher_test.exs +5 -0
- package/crawler/test/lib/crawler/fetcher/header_preparer_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher/policer_test.exs +71 -0
- package/crawler/test/lib/crawler/fetcher/recorder_test.exs +9 -0
- package/crawler/test/lib/crawler/fetcher/requester_test.exs +9 -0
- package/crawler/test/lib/crawler/fetcher/retrier_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher/url_filter_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher_test.exs +153 -0
- package/crawler/test/lib/crawler/http_test.exs +47 -0
- package/crawler/test/lib/crawler/linker/path_builder_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_expander_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_finder_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_offliner_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_prefixer_test.exs +7 -0
- package/crawler/test/lib/crawler/linker_test.exs +7 -0
- package/crawler/test/lib/crawler/options_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/css_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/guarder_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/html_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/link_parser/link_expander_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/link_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser_test.exs +8 -0
- package/crawler/test/lib/crawler/queue_handler_test.exs +7 -0
- package/crawler/test/lib/crawler/scraper_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper/dir_maker_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper/link_replacer_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper_test.exs +9 -0
- package/crawler/test/lib/crawler/worker_test.exs +5 -0
- package/crawler/test/lib/crawler_test.exs +295 -0
- package/crawler/test/support/test_case.ex +24 -0
- package/crawler/test/support/test_helpers.ex +28 -0
- package/crawler/test/test_helper.exs +7 -0
- package/grell/.rspec +2 -0
- package/grell/.travis.yml +28 -0
- package/grell/CHANGELOG.md +111 -0
- package/grell/Gemfile +7 -0
- package/grell/LICENSE.txt +22 -0
- package/grell/README.md +213 -0
- package/grell/Rakefile +2 -0
- package/grell/grell.gemspec +36 -0
- package/grell/lib/grell/capybara_driver.rb +44 -0
- package/grell/lib/grell/crawler.rb +83 -0
- package/grell/lib/grell/crawler_manager.rb +84 -0
- package/grell/lib/grell/grell_logger.rb +10 -0
- package/grell/lib/grell/page.rb +275 -0
- package/grell/lib/grell/page_collection.rb +62 -0
- package/grell/lib/grell/rawpage.rb +62 -0
- package/grell/lib/grell/reader.rb +18 -0
- package/grell/lib/grell/version.rb +3 -0
- package/grell/lib/grell.rb +11 -0
- package/grell/spec/lib/capybara_driver_spec.rb +38 -0
- package/grell/spec/lib/crawler_manager_spec.rb +174 -0
- package/grell/spec/lib/crawler_spec.rb +361 -0
- package/grell/spec/lib/page_collection_spec.rb +159 -0
- package/grell/spec/lib/page_spec.rb +418 -0
- package/grell/spec/lib/reader_spec.rb +43 -0
- package/grell/spec/spec_helper.rb +66 -0
- package/heartmagic/config.py +1 -0
- package/heartmagic/heart.py +3 -0
- package/heartmagic/pytransform/__init__.py +483 -0
- package/heartmagic/pytransform/_pytransform.dll +0 -0
- package/heartmagic/pytransform/_pytransform.so +0 -0
- package/httpStatusCode/README.md +2 -0
- package/httpStatusCode/httpStatusCode.js +4 -0
- package/httpStatusCode/reasonPhrases.js +344 -0
- package/httpStatusCode/statusCodes.js +344 -0
- package/package.json +1 -1
- package/rubyretriever/.rspec +2 -0
- package/rubyretriever/.travis.yml +7 -0
- package/rubyretriever/Gemfile +3 -0
- package/rubyretriever/Gemfile.lock +64 -0
- package/rubyretriever/LICENSE +20 -0
- package/rubyretriever/Rakefile +7 -0
- package/rubyretriever/bin/rr +79 -0
- package/rubyretriever/lib/retriever/cli.rb +25 -0
- package/rubyretriever/lib/retriever/core_ext.rb +13 -0
- package/rubyretriever/lib/retriever/fetch.rb +268 -0
- package/rubyretriever/lib/retriever/fetchfiles.rb +71 -0
- package/rubyretriever/lib/retriever/fetchseo.rb +18 -0
- package/rubyretriever/lib/retriever/fetchsitemap.rb +43 -0
- package/rubyretriever/lib/retriever/link.rb +47 -0
- package/rubyretriever/lib/retriever/openuri_redirect_patch.rb +8 -0
- package/rubyretriever/lib/retriever/page.rb +104 -0
- package/rubyretriever/lib/retriever/page_iterator.rb +21 -0
- package/rubyretriever/lib/retriever/target.rb +47 -0
- package/rubyretriever/lib/retriever/version.rb +4 -0
- package/rubyretriever/lib/retriever.rb +15 -0
- package/rubyretriever/readme.md +166 -0
- package/rubyretriever/rubyretriever.gemspec +41 -0
- package/rubyretriever/spec/link_spec.rb +77 -0
- package/rubyretriever/spec/page_spec.rb +94 -0
- package/rubyretriever/spec/retriever_spec.rb +84 -0
- package/rubyretriever/spec/spec_helper.rb +17 -0
- package/rubyretriever/spec/target_spec.rb +55 -0
- package/snapcrawl/.changelog.old.md +157 -0
- package/snapcrawl/.gitattributes +1 -0
- package/snapcrawl/.github/workflows/test.yml +41 -0
- package/snapcrawl/.rspec +3 -0
- package/snapcrawl/.rubocop.yml +23 -0
- package/snapcrawl/CHANGELOG.md +182 -0
- package/snapcrawl/Gemfile +15 -0
- package/snapcrawl/LICENSE +21 -0
- package/snapcrawl/README.md +135 -0
- package/snapcrawl/Runfile +35 -0
- package/snapcrawl/bin/snapcrawl +25 -0
- package/snapcrawl/lib/snapcrawl/cli.rb +52 -0
- package/snapcrawl/lib/snapcrawl/config.rb +60 -0
- package/snapcrawl/lib/snapcrawl/crawler.rb +98 -0
- package/snapcrawl/lib/snapcrawl/dependencies.rb +21 -0
- package/snapcrawl/lib/snapcrawl/exceptions.rb +5 -0
- package/snapcrawl/lib/snapcrawl/log_helpers.rb +36 -0
- package/snapcrawl/lib/snapcrawl/page.rb +118 -0
- package/snapcrawl/lib/snapcrawl/pretty_logger.rb +11 -0
- package/snapcrawl/lib/snapcrawl/refinements/pair_split.rb +26 -0
- package/snapcrawl/lib/snapcrawl/refinements/string_refinements.rb +13 -0
- package/snapcrawl/lib/snapcrawl/screenshot.rb +73 -0
- package/snapcrawl/lib/snapcrawl/templates/config.yml +49 -0
- package/snapcrawl/lib/snapcrawl/templates/docopt.txt +26 -0
- package/snapcrawl/lib/snapcrawl/version.rb +3 -0
- package/snapcrawl/lib/snapcrawl.rb +20 -0
- package/snapcrawl/snapcrawl.gemspec +27 -0
- package/snapcrawl/snapcrawl.yml +41 -0
- package/snapcrawl/spec/README.md +16 -0
- package/snapcrawl/spec/approvals/bin/help +26 -0
- package/snapcrawl/spec/approvals/bin/usage +4 -0
- package/snapcrawl/spec/approvals/cli/usage +4 -0
- package/snapcrawl/spec/approvals/config/defaults +15 -0
- package/snapcrawl/spec/approvals/config/minimal +15 -0
- package/snapcrawl/spec/approvals/integration/blacklist +14 -0
- package/snapcrawl/spec/approvals/integration/default-config +14 -0
- package/snapcrawl/spec/approvals/integration/depth-0 +6 -0
- package/snapcrawl/spec/approvals/integration/depth-3 +6 -0
- package/snapcrawl/spec/approvals/integration/log-color-no +6 -0
- package/snapcrawl/spec/approvals/integration/screenshot-error +3 -0
- package/snapcrawl/spec/approvals/integration/whitelist +14 -0
- package/snapcrawl/spec/approvals/models/pretty_logger/colors +1 -0
- package/snapcrawl/spec/fixtures/config/minimal.yml +4 -0
- package/snapcrawl/spec/server/config.ru +97 -0
- package/snapcrawl/spec/snapcrawl/bin_spec.rb +15 -0
- package/snapcrawl/spec/snapcrawl/cli_spec.rb +9 -0
- package/snapcrawl/spec/snapcrawl/config_spec.rb +26 -0
- package/snapcrawl/spec/snapcrawl/integration_spec.rb +65 -0
- package/snapcrawl/spec/snapcrawl/page_spec.rb +89 -0
- package/snapcrawl/spec/snapcrawl/pretty_logger_spec.rb +19 -0
- package/snapcrawl/spec/snapcrawl/refinements/pair_split_spec.rb +27 -0
- package/snapcrawl/spec/snapcrawl/refinements/string_refinements_spec.rb +29 -0
- package/snapcrawl/spec/snapcrawl/screenshot_spec.rb +62 -0
- package/snapcrawl/spec/spec_helper.rb +22 -0
- package/snapcrawl/spec/spec_mixin.rb +10 -0
package/crawler/lib/crawler/fetcher.ex
@@ -0,0 +1,81 @@

```elixir
defmodule Crawler.Fetcher do
  @moduledoc """
  Fetches pages and performs tasks on them.
  """

  require Logger

  alias Crawler.Fetcher.HeaderPreparer
  alias Crawler.Fetcher.Policer
  alias Crawler.Fetcher.Recorder
  alias Crawler.Fetcher.Requester
  alias Crawler.Snapper
  alias Crawler.Store.Page

  @doc """
  Fetches a URL by:

  - verifying whether the URL needs fetching through `Crawler.Fetcher.Policer.police/1`
  - recording data for internal use through `Crawler.Fetcher.Recorder.record/1`
  - fetching the URL
  - performing retries upon failed fetches through `Crawler.Fetcher.Retrier.perform/2`
  """
  def fetch(opts) do
    with {:ok, opts} <- Policer.police(opts),
         {:ok, opts} <- Recorder.record(opts) do
      opts[:retrier].perform(fn -> fetch_url(opts) end, opts)
    end
  end

  defp fetch_url(opts) do
    case Requester.make(opts) do
      {:ok, %HTTPoison.Response{status_code: 200, body: body, headers: headers}} ->
        fetch_url_200(body, headers, opts)

      {:ok, %HTTPoison.Response{status_code: status_code}} ->
        fetch_url_non_200(status_code, opts)

      {:error, %HTTPoison.Error{reason: reason}} ->
        fetch_url_failed(reason, opts)
    end
  end

  defp fetch_url_200(body, headers, opts) do
    with opts <- HeaderPreparer.prepare(headers, opts),
         {:ok, _} <- Recorder.maybe_store_page(body, opts),
         {:ok, opts} <- record_referrer_url(opts),
         {:ok, _} <- snap_page(body, opts) do
      Logger.debug("Fetched #{opts[:url]}")

      %Page{url: opts[:url], body: body, opts: opts}
    end
  end

  defp fetch_url_non_200(status_code, opts) do
    msg = "Failed to fetch #{opts[:url]}, status code: #{status_code}"

    Logger.debug(msg)

    {:warn, msg}
  end

  defp fetch_url_failed(reason, opts) do
    msg = "Failed to fetch #{opts[:url]}, reason: #{inspect(reason)}"

    Logger.debug(msg)

    {:warn, msg}
  end

  defp record_referrer_url(opts) do
    {:ok, Map.put(opts, :referrer_url, opts[:url])}
  end

  defp snap_page(body, opts) do
    if opts[:save_to] do
      Snapper.snap(body, opts)
    else
      {:ok, ""}
    end
  end
end
```
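The fetch pipeline above is driven entirely by the `opts` map: `Policer` and `Recorder` run first, then the configured `:retrier` module wraps the actual request. A minimal sketch of how it could be exercised, assuming a hypothetical pass-through retrier in place of the `Crawler.Fetcher.Retrier` default, and assuming the rest of the option set that `Crawler.Options` (also in this diff) normally supplies:

```elixir
# Hypothetical pass-through retrier for illustration; it satisfies the same
# perform/2 contract the fetcher expects but performs no retries.
defmodule PassthroughRetrier do
  def perform(fetch_fun, _opts), do: fetch_fun.()
end

# Option names follow the fetcher above; values here are illustrative only.
opts = %{
  url: "http://hello.world/",
  retrier: PassthroughRetrier,
  save_to: nil
}

# Policer must approve the URL and Recorder must record it before the
# request is made; on a 200 response this returns a %Crawler.Store.Page{}.
Crawler.Fetcher.fetch(opts)
```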
package/crawler/lib/crawler/linker/path_builder.ex
@@ -0,0 +1,71 @@

```elixir
defmodule Crawler.Linker.PathBuilder do
  @moduledoc """
  Builds a path for a link (can be a URL itself or a relative link) based on
  the input string which is a URL with or without its protocol.
  """

  alias Crawler.Linker.PathExpander
  alias Crawler.Linker.PathFinder

  @doc """
  Builds a path for a link (can be a URL itself or a relative link) based on
  the input string which is a URL with or without its protocol.

  ## Examples

      iex> PathBuilder.build_path(
      iex>   "https://cool.beans:7777/dir/page1",
      iex>   "https://hello.world:8888/remote/page2"
      iex> )
      "hello.world-8888/remote/page2"

      iex> PathBuilder.build_path(
      iex>   "https://cool.beans:7777/dir/page1",
      iex>   "local/page2"
      iex> )
      "cool.beans-7777/dir/local/page2"

      iex> PathBuilder.build_path(
      iex>   "https://cool.beans:7777/dir/page1",
      iex>   "/local/page2"
      iex> )
      "cool.beans-7777/local/page2"

      iex> PathBuilder.build_path(
      iex>   "https://cool.beans:7777/parent/dir/page1",
      iex>   "../local/page2"
      iex> )
      "cool.beans-7777/parent/local/page2"

      iex> PathBuilder.build_path(
      iex>   "https://cool.beans:7777/parent/dir/page1",
      iex>   "../../local/page2"
      iex> )
      "cool.beans-7777/local/page2"
  """
  def build_path(current_url, link, safe \\ true) do
    current_url
    |> base_path(link, safe)
    |> build(link, safe)
  end

  defp base_path(url, "/" <> _link, safe), do: PathFinder.find_domain(url, safe)
  defp base_path(url, _link, safe), do: PathFinder.find_base_path(url, safe)

  defp build(path, link, safe) do
    link
    |> normalise(path)
    |> PathFinder.find_path(safe)
    |> PathExpander.expand_dot()
  end

  defp normalise(link, path) do
    link
    |> String.split("://", parts: 2)
    |> Enum.count()
    |> join_path(link, path)
  end

  defp join_path(2, link, _path), do: link
  defp join_path(1, link, path), do: Path.join(path, link)
end
```
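The `normalise/2` dispatch above hinges on a small trick: splitting the link on `://` produces two parts only when the link is itself a full URL. A quick sketch of the two cases:

```elixir
# A full URL splits into two parts, so join_path/3 keeps the link as-is:
String.split("https://hello.world:8888/remote/page2", "://", parts: 2)
#=> ["https", "hello.world:8888/remote/page2"]

# A relative link yields a single part, so it gets joined onto the base path:
String.split("local/page2", "://", parts: 2)
#=> ["local/page2"]
```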
package/crawler/lib/crawler/linker/path_expander.ex
@@ -0,0 +1,59 @@

```elixir
defmodule Crawler.Linker.PathExpander do
  @moduledoc """
  Expands the path by expanding any `.` and `..` segments.

  See [this pull request](https://github.com/elixir-lang/elixir/pull/6486).
  """

  @doc """
  Expands the path by expanding any `.` and `..` segments.

  Note this function only expands any `.` and `..` segments within the given
  path, and it does not take into account the absolute or relative nature of
  the path itself, for that please use `expand/1` or `expand/2`.

  ## Examples

  ### Intended use case

      iex> PathExpander.expand_dot("foo/bar/../baz")
      "foo/baz"

      iex> PathExpander.expand_dot("/foo/bar/../baz")
      "/foo/baz"

  ### Non-intended use cases are ignored

      iex> PathExpander.expand_dot("foo/bar/./baz")
      "foo/bar/baz"

      iex> PathExpander.expand_dot("../foo/bar")
      "foo/bar"
  """
  def expand_dot(<<"/", rest::binary>>),
    do: "/" <> do_expand_dot(rest)

  def expand_dot(path),
    do: do_expand_dot(path)

  defp do_expand_dot(path),
    do: do_expand_dot(:binary.split(path, "/", [:global]), [])

  defp do_expand_dot([".." | t], [_, _ | acc]),
    do: do_expand_dot(t, acc)

  defp do_expand_dot([".." | t], []),
    do: do_expand_dot(t, [])

  defp do_expand_dot(["." | t], acc),
    do: do_expand_dot(t, acc)

  defp do_expand_dot([h | t], acc),
    do: do_expand_dot(t, ["/", h | acc])

  defp do_expand_dot([], []),
    do: ""

  defp do_expand_dot([], ["/" | acc]),
    do: IO.iodata_to_binary(:lists.reverse(acc))
end
```
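The accumulator makes the `..` handling cheap: every kept segment pushes both a separator and the segment itself, so a `..` can drop its predecessor by popping exactly two elements. Tracing `expand_dot("foo/bar/../baz")`:

```elixir
# Each step shows the remaining segments and the accumulator:
#   ["foo", "bar", "..", "baz"]  acc: []
#   ["bar", "..", "baz"]         acc: ["/", "foo"]
#   ["..", "baz"]                acc: ["/", "bar", "/", "foo"]
#   ["baz"]                      acc: ["/", "foo"]          # ".." popped two
#   []                           acc: ["/", "baz", "/", "foo"]
# The final clause drops the leading "/" and reverses the iodata:
Crawler.Linker.PathExpander.expand_dot("foo/bar/../baz")
#=> "foo/baz"
```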
package/crawler/lib/crawler/linker/path_finder.ex
@@ -0,0 +1,106 @@

```elixir
defmodule Crawler.Linker.PathFinder do
  @moduledoc """
  Finds different components of a given URL, e.g. its domain name, directory
  path, or full path.

  The `safe` option in some of the functions indicates whether the return
  value should be transformed in order to be safely used as folder and file
  names.
  """

  @doc """
  Finds the URL scheme (e.g. `https://`).

  ## Examples

      iex> PathFinder.find_scheme("http://hi.hello")
      "http://"

      iex> PathFinder.find_scheme("https://hi.hello:8888/")
      "https://"
  """
  def find_scheme(url) do
    (url
     |> String.split("://", parts: 2)
     |> Kernel.hd()) <> "://"
  end

  @doc """
  Finds the domain name with port number (e.g. `example.org:8080`).

  ## Examples

      iex> PathFinder.find_domain("http://hi.hello")
      "hi.hello"

      iex> PathFinder.find_domain("https://hi.hello:8888/world")
      "hi.hello-8888"

      iex> PathFinder.find_domain("https://hi.hello:8888/world", false)
      "hi.hello:8888"
  """
  def find_domain(url, safe \\ true) do
    url
    |> find_path(safe)
    |> String.split("/", parts: 2)
    |> Kernel.hd()
  end

  @doc """
  Finds the base path of a given page.

  ## Examples

      iex> PathFinder.find_base_path("http://hi.hello")
      "hi.hello"

      iex> PathFinder.find_base_path("https://hi.hello:8888/dir/world")
      "hi.hello-8888/dir"

      iex> PathFinder.find_base_path("https://hi.hello:8888/dir/world", false)
      "hi.hello:8888/dir"
  """
  def find_base_path(url, safe \\ true) do
    url
    |> find_path(safe)
    |> String.split("/")
    |> base_path()
  end

  defp base_path([path]), do: path

  defp base_path(list) do
    [_head | tail] = Enum.reverse(list)

    tail
    |> Enum.reverse()
    |> Path.join()
  end

  @doc """
  Finds the full path of a given page.

  ## Examples

      iex> PathFinder.find_path("http://hi.hello")
      "hi.hello"

      iex> PathFinder.find_path("https://hi.hello:8888/world")
      "hi.hello-8888/world"

      iex> PathFinder.find_path("https://hi.hello:8888/world", false)
      "hi.hello:8888/world"
  """
  def find_path(url, safe \\ true)

  def find_path(url, false) do
    url
    |> String.split("://", parts: 2)
    |> Enum.at(-1)
  end

  def find_path(url, true) do
    url
    |> find_path(false)
    |> String.replace(":", "-")
  end
end
```
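All three finders funnel through `find_path/2`, and the `safe` variant is just the unsafe result with `:` swapped for `-`, so a port number never puts a colon into a directory name. The equivalent pipeline, spelled out:

```elixir
# Equivalent of PathFinder.find_path("https://hi.hello:8888/world", true):
"https://hi.hello:8888/world"
|> String.split("://", parts: 2)  # ["https", "hi.hello:8888/world"]
|> Enum.at(-1)                    # strip the scheme
|> String.replace(":", "-")       # make the port filesystem-safe
#=> "hi.hello-8888/world"
```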
package/crawler/lib/crawler/linker/path_offliner.ex
@@ -0,0 +1,59 @@

```elixir
defmodule Crawler.Linker.PathOffliner do
  @moduledoc """
  Transforms a link to be storable and linkable offline.
  """

  alias Crawler.Linker.PathFinder

  @doc """
  Transforms a given link so that it can be stored and linked to by other pages.

  When a page does not have a file extension (e.g. html) it is treated as the
  index page for a directory, therefore `index.html` is appended to the link.

  ## Examples

      iex> PathOffliner.transform("http://hello.world")
      "http://hello.world/index.html"

      iex> PathOffliner.transform("hello.world")
      "hello.world/index.html"

      iex> PathOffliner.transform("hello.world/")
      "hello.world/index.html"

      iex> PathOffliner.transform("hello/world")
      "hello/world/index.html"

      iex> PathOffliner.transform("hello/world.html")
      "hello/world.html"
  """
  def transform(link) do
    link
    |> PathFinder.find_path()
    |> String.split("/", trim: true)
    |> Enum.count()
    |> last_segment(link)
  end

  defp last_segment(1, link) do
    transform_link(false, link)
  end

  defp last_segment(_count, link) do
    link
    |> String.split("/")
    |> Enum.take(-1)
    |> Kernel.hd()
    |> transform_segment(link)
  end

  defp transform_segment(segment, link) do
    segment
    |> String.contains?(".")
    |> transform_link(link)
  end

  defp transform_link(true, link), do: link
  defp transform_link(false, link), do: Path.join(link, "index.html")
end
```
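The single-segment clause exists because the dot heuristic would misfire on a bare domain: `hello.world` contains a `.` yet is a directory, not a file, so `last_segment(1, link)` forces the `index.html` suffix without consulting `transform_segment/2`. For multi-segment paths only the last segment's dot matters:

```elixir
# Bare domain: one segment, dot check skipped, index.html appended.
Crawler.Linker.PathOffliner.transform("hello.world")
#=> "hello.world/index.html"

# Last segment "world.html" contains a dot, so the link is kept as a file.
Crawler.Linker.PathOffliner.transform("hello/world.html")
#=> "hello/world.html"
```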
package/crawler/lib/crawler/linker/path_prefixer.ex
@@ -0,0 +1,46 @@

```elixir
defmodule Crawler.Linker.PathPrefixer do
  @moduledoc """
  Returns prefixes (`../`s) according to the given URL's structure.
  """

  alias Crawler.Linker.PathFinder
  alias Crawler.Linker.PathOffliner

  @doc """
  Returns prefixes (`../`s) according to the given URL's structure.

  ## Examples

      iex> PathPrefixer.prefix("https://hello.world/")
      "../"

      iex> PathPrefixer.prefix("https://hello.world/page.html")
      "../"

      iex> PathPrefixer.prefix("https://hello.world/page")
      "../../"

      iex> PathPrefixer.prefix("https://hello.world/dir/page.html")
      "../../"

      iex> PathPrefixer.prefix("https://hello.world/dir/page")
      "../../../"
  """
  def prefix(current_url) do
    current_url
    |> PathFinder.find_path()
    |> PathOffliner.transform()
    |> count_depth()
    |> make_prefix()
  end

  defp count_depth(string, token \\ "/") do
    (string
     |> String.split(token)
     |> Enum.count()) - 1
  end

  defp make_prefix(depth) do
    String.duplicate("../", depth)
  end
end
```
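Note that `prefix/1` counts depth on the offline form of the path, which is why an extension-less page gains an extra `../`: the appended `index.html` adds one more directory level. For `"https://hello.world/page"`:

```elixir
# find_path strips the scheme, PathOffliner appends the index file:
path = "hello.world/page/index.html"

# Two "/" separators mean the snapshot lives two directories deep,
# hence two "../" hops back to the snapshot root.
depth = (path |> String.split("/") |> Enum.count()) - 1
String.duplicate("../", depth)
#=> "../../"
```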
package/crawler/lib/crawler/linker.ex
@@ -0,0 +1,173 @@

```elixir
defmodule Crawler.Linker do
  @moduledoc """
  A set of high level functions for making online and offline URLs and links.
  """

  alias Crawler.Linker.PathBuilder
  alias Crawler.Linker.PathFinder
  alias Crawler.Linker.PathOffliner
  alias Crawler.Linker.PathPrefixer

  @doc """
  Given the `current_url`, it works out what the offline URL should be for
  `link`.

  ## Examples

      iex> Linker.offline_url(
      iex>   "http://hello.world/dir/page",
      iex>   "page1"
      iex> )
      "http://hello.world/dir/page1/index.html"

      iex> Linker.offline_url(
      iex>   "http://hello.world/dir/page",
      iex>   "page1.html"
      iex> )
      "http://hello.world/dir/page1.html"

      iex> Linker.offline_url(
      iex>   "http://hello.world/dir/page",
      iex>   "../page1"
      iex> )
      "http://hello.world/page1/index.html"

      iex> Linker.offline_url(
      iex>   "http://hello.world/dir/page",
      iex>   "../page1.html"
      iex> )
      "http://hello.world/page1.html"

      iex> Linker.offline_url(
      iex>   "http://hello.world/dir/page",
      iex>   "http://thank.you/page1"
      iex> )
      "http://thank.you/page1/index.html"

      iex> Linker.offline_url(
      iex>   "http://hello.world/dir/page",
      iex>   "http://thank.you/page1.html"
      iex> )
      "http://thank.you/page1.html"

      iex> Linker.offline_url(
      iex>   "http://hello.world/dir/page",
      iex>   "http://thank.you/"
      iex> )
      "http://thank.you/index.html"
  """
  def offline_url(current_url, link) do
    current_url
    |> url(link)
    |> PathOffliner.transform()
  end

  @doc """
  Given the `current_url`, it works out what the relative offline link should
  be for `link`.

  ## Examples

      iex> Linker.offline_link(
      iex>   "http://hello.world/dir/page",
      iex>   "page1"
      iex> )
      "../../../hello.world/dir/page1/index.html"

      iex> Linker.offline_link(
      iex>   "http://hello.world/dir/page",
      iex>   "page1.html"
      iex> )
      "../../../hello.world/dir/page1.html"

      iex> Linker.offline_link(
      iex>   "http://hello.world/dir/page",
      iex>   "../page1"
      iex> )
      "../../../hello.world/page1/index.html"

      iex> Linker.offline_link(
      iex>   "http://hello.world/dir/page",
      iex>   "../page1.html"
      iex> )
      "../../../hello.world/page1.html"

      iex> Linker.offline_link(
      iex>   "http://hello.world/dir/page",
      iex>   "http://thank.you/page1"
      iex> )
      "../../../thank.you/page1/index.html"

      iex> Linker.offline_link(
      iex>   "http://hello.world/dir/page",
      iex>   "http://thank.you/page1.html"
      iex> )
      "../../../thank.you/page1.html"
  """
  def offline_link(current_url, link) do
    current_url
    |> link(link)
    |> PathOffliner.transform()
  end

  @doc """
  Given the `current_url`, it works out what the URL should be for `link`.

  ## Examples

      iex> Linker.url(
      iex>   "http://another.domain:8888/page",
      iex>   "/dir/page2"
      iex> )
      "http://another.domain:8888/dir/page2"

      iex> Linker.url(
      iex>   "http://another.domain:8888/parent/page",
      iex>   "dir/page2"
      iex> )
      "http://another.domain:8888/parent/dir/page2"
  """
  def url(current_url, link) do
    Path.join(
      PathFinder.find_scheme(current_url),
      PathBuilder.build_path(current_url, link, false)
    )
  end

  @doc """
  Given the `current_url`, it works out what the relative link should be for
  `link`.

  ## Examples

      iex> Linker.link(
      iex>   "http://another.domain/page.html",
      iex>   "/dir/page2"
      iex> )
      "../another.domain/dir/page2"

      iex> Linker.link(
      iex>   "http://another.domain/page",
      iex>   "/dir/page2"
      iex> )
      "../../another.domain/dir/page2"

      iex> Linker.link(
      iex>   "http://another.domain/parent/page",
      iex>   "dir/page2"
      iex> )
      "../../../another.domain/parent/dir/page2"

      iex> Linker.link(
      iex>   "http://another.domain/parent/page",
      iex>   "../dir/page2"
      iex> )
      "../../../another.domain/dir/page2"
  """
  def link(current_url, link) do
    Path.join(
      PathPrefixer.prefix(current_url),
      PathBuilder.build_path(current_url, link)
    )
  end
end
```
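Putting the pieces together, `offline_link/2` is `PathPrefixer` plus `PathBuilder` joined with `Path.join/2` and then made storable by `PathOffliner`; a trace of the first `offline_link` doctest above:

```elixir
# Trace of Linker.offline_link("http://hello.world/dir/page", "page1"):
Crawler.Linker.PathPrefixer.prefix("http://hello.world/dir/page")
#=> "../../../"   (offline form is hello.world/dir/page/index.html, depth 3)

Crawler.Linker.PathBuilder.build_path("http://hello.world/dir/page", "page1")
#=> "hello.world/dir/page1"

# Path.join/2 plus PathOffliner.transform/1 yield the final relative link:
#=> "../../../hello.world/dir/page1/index.html"
```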