powerdlz23 1.2.3 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. package/Spider/README.md +19 -0
  2. package/Spider/domain.py +18 -0
  3. package/Spider/general.py +51 -0
  4. package/Spider/link_finder.py +25 -0
  5. package/Spider/main.py +50 -0
  6. package/Spider/spider.py +74 -0
  7. package/crawler/.formatter.exs +5 -0
  8. package/crawler/.github/workflows/ci.yml +29 -0
  9. package/crawler/.recode.exs +33 -0
  10. package/crawler/.tool-versions +2 -0
  11. package/crawler/CHANGELOG.md +82 -0
  12. package/crawler/README.md +198 -0
  13. package/crawler/architecture.svg +4 -0
  14. package/crawler/config/config.exs +9 -0
  15. package/crawler/config/dev.exs +5 -0
  16. package/crawler/config/test.exs +5 -0
  17. package/crawler/examples/google_search/scraper.ex +37 -0
  18. package/crawler/examples/google_search/url_filter.ex +11 -0
  19. package/crawler/examples/google_search.ex +77 -0
  20. package/crawler/lib/crawler/dispatcher/worker.ex +14 -0
  21. package/crawler/lib/crawler/dispatcher.ex +20 -0
  22. package/crawler/lib/crawler/fetcher/header_preparer.ex +60 -0
  23. package/crawler/lib/crawler/fetcher/modifier.ex +45 -0
  24. package/crawler/lib/crawler/fetcher/policer.ex +77 -0
  25. package/crawler/lib/crawler/fetcher/recorder.ex +55 -0
  26. package/crawler/lib/crawler/fetcher/requester.ex +32 -0
  27. package/crawler/lib/crawler/fetcher/retrier.ex +43 -0
  28. package/crawler/lib/crawler/fetcher/url_filter.ex +26 -0
  29. package/crawler/lib/crawler/fetcher.ex +81 -0
  30. package/crawler/lib/crawler/http.ex +7 -0
  31. package/crawler/lib/crawler/linker/path_builder.ex +71 -0
  32. package/crawler/lib/crawler/linker/path_expander.ex +59 -0
  33. package/crawler/lib/crawler/linker/path_finder.ex +106 -0
  34. package/crawler/lib/crawler/linker/path_offliner.ex +59 -0
  35. package/crawler/lib/crawler/linker/path_prefixer.ex +46 -0
  36. package/crawler/lib/crawler/linker.ex +173 -0
  37. package/crawler/lib/crawler/options.ex +127 -0
  38. package/crawler/lib/crawler/parser/css_parser.ex +37 -0
  39. package/crawler/lib/crawler/parser/guarder.ex +38 -0
  40. package/crawler/lib/crawler/parser/html_parser.ex +41 -0
  41. package/crawler/lib/crawler/parser/link_parser/link_expander.ex +32 -0
  42. package/crawler/lib/crawler/parser/link_parser.ex +50 -0
  43. package/crawler/lib/crawler/parser.ex +122 -0
  44. package/crawler/lib/crawler/queue_handler.ex +45 -0
  45. package/crawler/lib/crawler/scraper.ex +28 -0
  46. package/crawler/lib/crawler/snapper/dir_maker.ex +45 -0
  47. package/crawler/lib/crawler/snapper/link_replacer.ex +95 -0
  48. package/crawler/lib/crawler/snapper.ex +82 -0
  49. package/crawler/lib/crawler/store/counter.ex +19 -0
  50. package/crawler/lib/crawler/store/page.ex +7 -0
  51. package/crawler/lib/crawler/store.ex +87 -0
  52. package/crawler/lib/crawler/worker.ex +62 -0
  53. package/crawler/lib/crawler.ex +91 -0
  54. package/crawler/mix.exs +78 -0
  55. package/crawler/mix.lock +40 -0
  56. package/crawler/test/fixtures/introducing-elixir.jpg +0 -0
  57. package/crawler/test/integration_test.exs +135 -0
  58. package/crawler/test/lib/crawler/dispatcher/worker_test.exs +7 -0
  59. package/crawler/test/lib/crawler/dispatcher_test.exs +5 -0
  60. package/crawler/test/lib/crawler/fetcher/header_preparer_test.exs +7 -0
  61. package/crawler/test/lib/crawler/fetcher/policer_test.exs +71 -0
  62. package/crawler/test/lib/crawler/fetcher/recorder_test.exs +9 -0
  63. package/crawler/test/lib/crawler/fetcher/requester_test.exs +9 -0
  64. package/crawler/test/lib/crawler/fetcher/retrier_test.exs +7 -0
  65. package/crawler/test/lib/crawler/fetcher/url_filter_test.exs +7 -0
  66. package/crawler/test/lib/crawler/fetcher_test.exs +153 -0
  67. package/crawler/test/lib/crawler/http_test.exs +47 -0
  68. package/crawler/test/lib/crawler/linker/path_builder_test.exs +7 -0
  69. package/crawler/test/lib/crawler/linker/path_expander_test.exs +7 -0
  70. package/crawler/test/lib/crawler/linker/path_finder_test.exs +7 -0
  71. package/crawler/test/lib/crawler/linker/path_offliner_test.exs +7 -0
  72. package/crawler/test/lib/crawler/linker/path_prefixer_test.exs +7 -0
  73. package/crawler/test/lib/crawler/linker_test.exs +7 -0
  74. package/crawler/test/lib/crawler/options_test.exs +7 -0
  75. package/crawler/test/lib/crawler/parser/css_parser_test.exs +7 -0
  76. package/crawler/test/lib/crawler/parser/guarder_test.exs +7 -0
  77. package/crawler/test/lib/crawler/parser/html_parser_test.exs +7 -0
  78. package/crawler/test/lib/crawler/parser/link_parser/link_expander_test.exs +7 -0
  79. package/crawler/test/lib/crawler/parser/link_parser_test.exs +7 -0
  80. package/crawler/test/lib/crawler/parser_test.exs +8 -0
  81. package/crawler/test/lib/crawler/queue_handler_test.exs +7 -0
  82. package/crawler/test/lib/crawler/scraper_test.exs +7 -0
  83. package/crawler/test/lib/crawler/snapper/dir_maker_test.exs +7 -0
  84. package/crawler/test/lib/crawler/snapper/link_replacer_test.exs +7 -0
  85. package/crawler/test/lib/crawler/snapper_test.exs +9 -0
  86. package/crawler/test/lib/crawler/worker_test.exs +5 -0
  87. package/crawler/test/lib/crawler_test.exs +295 -0
  88. package/crawler/test/support/test_case.ex +24 -0
  89. package/crawler/test/support/test_helpers.ex +28 -0
  90. package/crawler/test/test_helper.exs +7 -0
  91. package/grell/.rspec +2 -0
  92. package/grell/.travis.yml +28 -0
  93. package/grell/CHANGELOG.md +111 -0
  94. package/grell/Gemfile +7 -0
  95. package/grell/LICENSE.txt +22 -0
  96. package/grell/README.md +213 -0
  97. package/grell/Rakefile +2 -0
  98. package/grell/grell.gemspec +36 -0
  99. package/grell/lib/grell/capybara_driver.rb +44 -0
  100. package/grell/lib/grell/crawler.rb +83 -0
  101. package/grell/lib/grell/crawler_manager.rb +84 -0
  102. package/grell/lib/grell/grell_logger.rb +10 -0
  103. package/grell/lib/grell/page.rb +275 -0
  104. package/grell/lib/grell/page_collection.rb +62 -0
  105. package/grell/lib/grell/rawpage.rb +62 -0
  106. package/grell/lib/grell/reader.rb +18 -0
  107. package/grell/lib/grell/version.rb +3 -0
  108. package/grell/lib/grell.rb +11 -0
  109. package/grell/spec/lib/capybara_driver_spec.rb +38 -0
  110. package/grell/spec/lib/crawler_manager_spec.rb +174 -0
  111. package/grell/spec/lib/crawler_spec.rb +361 -0
  112. package/grell/spec/lib/page_collection_spec.rb +159 -0
  113. package/grell/spec/lib/page_spec.rb +418 -0
  114. package/grell/spec/lib/reader_spec.rb +43 -0
  115. package/grell/spec/spec_helper.rb +66 -0
  116. package/heartmagic/config.py +1 -0
  117. package/heartmagic/heart.py +3 -0
  118. package/heartmagic/pytransform/__init__.py +483 -0
  119. package/heartmagic/pytransform/_pytransform.dll +0 -0
  120. package/heartmagic/pytransform/_pytransform.so +0 -0
  121. package/httpStatusCode/README.md +2 -0
  122. package/httpStatusCode/httpStatusCode.js +4 -0
  123. package/httpStatusCode/reasonPhrases.js +344 -0
  124. package/httpStatusCode/statusCodes.js +344 -0
  125. package/package.json +1 -1
  126. package/rubyretriever/.rspec +2 -0
  127. package/rubyretriever/.travis.yml +7 -0
  128. package/rubyretriever/Gemfile +3 -0
  129. package/rubyretriever/Gemfile.lock +64 -0
  130. package/rubyretriever/LICENSE +20 -0
  131. package/rubyretriever/Rakefile +7 -0
  132. package/rubyretriever/bin/rr +79 -0
  133. package/rubyretriever/lib/retriever/cli.rb +25 -0
  134. package/rubyretriever/lib/retriever/core_ext.rb +13 -0
  135. package/rubyretriever/lib/retriever/fetch.rb +268 -0
  136. package/rubyretriever/lib/retriever/fetchfiles.rb +71 -0
  137. package/rubyretriever/lib/retriever/fetchseo.rb +18 -0
  138. package/rubyretriever/lib/retriever/fetchsitemap.rb +43 -0
  139. package/rubyretriever/lib/retriever/link.rb +47 -0
  140. package/rubyretriever/lib/retriever/openuri_redirect_patch.rb +8 -0
  141. package/rubyretriever/lib/retriever/page.rb +104 -0
  142. package/rubyretriever/lib/retriever/page_iterator.rb +21 -0
  143. package/rubyretriever/lib/retriever/target.rb +47 -0
  144. package/rubyretriever/lib/retriever/version.rb +4 -0
  145. package/rubyretriever/lib/retriever.rb +15 -0
  146. package/rubyretriever/readme.md +166 -0
  147. package/rubyretriever/rubyretriever.gemspec +41 -0
  148. package/rubyretriever/spec/link_spec.rb +77 -0
  149. package/rubyretriever/spec/page_spec.rb +94 -0
  150. package/rubyretriever/spec/retriever_spec.rb +84 -0
  151. package/rubyretriever/spec/spec_helper.rb +17 -0
  152. package/rubyretriever/spec/target_spec.rb +55 -0
  153. package/snapcrawl/.changelog.old.md +157 -0
  154. package/snapcrawl/.gitattributes +1 -0
  155. package/snapcrawl/.github/workflows/test.yml +41 -0
  156. package/snapcrawl/.rspec +3 -0
  157. package/snapcrawl/.rubocop.yml +23 -0
  158. package/snapcrawl/CHANGELOG.md +182 -0
  159. package/snapcrawl/Gemfile +15 -0
  160. package/snapcrawl/LICENSE +21 -0
  161. package/snapcrawl/README.md +135 -0
  162. package/snapcrawl/Runfile +35 -0
  163. package/snapcrawl/bin/snapcrawl +25 -0
  164. package/snapcrawl/lib/snapcrawl/cli.rb +52 -0
  165. package/snapcrawl/lib/snapcrawl/config.rb +60 -0
  166. package/snapcrawl/lib/snapcrawl/crawler.rb +98 -0
  167. package/snapcrawl/lib/snapcrawl/dependencies.rb +21 -0
  168. package/snapcrawl/lib/snapcrawl/exceptions.rb +5 -0
  169. package/snapcrawl/lib/snapcrawl/log_helpers.rb +36 -0
  170. package/snapcrawl/lib/snapcrawl/page.rb +118 -0
  171. package/snapcrawl/lib/snapcrawl/pretty_logger.rb +11 -0
  172. package/snapcrawl/lib/snapcrawl/refinements/pair_split.rb +26 -0
  173. package/snapcrawl/lib/snapcrawl/refinements/string_refinements.rb +13 -0
  174. package/snapcrawl/lib/snapcrawl/screenshot.rb +73 -0
  175. package/snapcrawl/lib/snapcrawl/templates/config.yml +49 -0
  176. package/snapcrawl/lib/snapcrawl/templates/docopt.txt +26 -0
  177. package/snapcrawl/lib/snapcrawl/version.rb +3 -0
  178. package/snapcrawl/lib/snapcrawl.rb +20 -0
  179. package/snapcrawl/snapcrawl.gemspec +27 -0
  180. package/snapcrawl/snapcrawl.yml +41 -0
  181. package/snapcrawl/spec/README.md +16 -0
  182. package/snapcrawl/spec/approvals/bin/help +26 -0
  183. package/snapcrawl/spec/approvals/bin/usage +4 -0
  184. package/snapcrawl/spec/approvals/cli/usage +4 -0
  185. package/snapcrawl/spec/approvals/config/defaults +15 -0
  186. package/snapcrawl/spec/approvals/config/minimal +15 -0
  187. package/snapcrawl/spec/approvals/integration/blacklist +14 -0
  188. package/snapcrawl/spec/approvals/integration/default-config +14 -0
  189. package/snapcrawl/spec/approvals/integration/depth-0 +6 -0
  190. package/snapcrawl/spec/approvals/integration/depth-3 +6 -0
  191. package/snapcrawl/spec/approvals/integration/log-color-no +6 -0
  192. package/snapcrawl/spec/approvals/integration/screenshot-error +3 -0
  193. package/snapcrawl/spec/approvals/integration/whitelist +14 -0
  194. package/snapcrawl/spec/approvals/models/pretty_logger/colors +1 -0
  195. package/snapcrawl/spec/fixtures/config/minimal.yml +4 -0
  196. package/snapcrawl/spec/server/config.ru +97 -0
  197. package/snapcrawl/spec/snapcrawl/bin_spec.rb +15 -0
  198. package/snapcrawl/spec/snapcrawl/cli_spec.rb +9 -0
  199. package/snapcrawl/spec/snapcrawl/config_spec.rb +26 -0
  200. package/snapcrawl/spec/snapcrawl/integration_spec.rb +65 -0
  201. package/snapcrawl/spec/snapcrawl/page_spec.rb +89 -0
  202. package/snapcrawl/spec/snapcrawl/pretty_logger_spec.rb +19 -0
  203. package/snapcrawl/spec/snapcrawl/refinements/pair_split_spec.rb +27 -0
  204. package/snapcrawl/spec/snapcrawl/refinements/string_refinements_spec.rb +29 -0
  205. package/snapcrawl/spec/snapcrawl/screenshot_spec.rb +62 -0
  206. package/snapcrawl/spec/spec_helper.rb +22 -0
  207. package/snapcrawl/spec/spec_mixin.rb +10 -0
package/crawler/lib/crawler/fetcher.ex
@@ -0,0 +1,81 @@
+ defmodule Crawler.Fetcher do
+   @moduledoc """
+   Fetches pages and performs tasks on them.
+   """
+
+   require Logger
+
+   alias Crawler.Fetcher.HeaderPreparer
+   alias Crawler.Fetcher.Policer
+   alias Crawler.Fetcher.Recorder
+   alias Crawler.Fetcher.Requester
+   alias Crawler.Snapper
+   alias Crawler.Store.Page
+
+   @doc """
+   Fetches a URL by:
+
+   - verifying whether the URL needs fetching through `Crawler.Fetcher.Policer.police/1`
+   - recording data for internal use through `Crawler.Fetcher.Recorder.record/1`
+   - fetching the URL
+   - performing retries upon failed fetches through `Crawler.Fetcher.Retrier.perform/2`
+   """
+   def fetch(opts) do
+     with {:ok, opts} <- Policer.police(opts),
+          {:ok, opts} <- Recorder.record(opts) do
+       opts[:retrier].perform(fn -> fetch_url(opts) end, opts)
+     end
+   end
+
+   defp fetch_url(opts) do
+     case Requester.make(opts) do
+       {:ok, %HTTPoison.Response{status_code: 200, body: body, headers: headers}} ->
+         fetch_url_200(body, headers, opts)
+
+       {:ok, %HTTPoison.Response{status_code: status_code}} ->
+         fetch_url_non_200(status_code, opts)
+
+       {:error, %HTTPoison.Error{reason: reason}} ->
+         fetch_url_failed(reason, opts)
+     end
+   end
+
+   defp fetch_url_200(body, headers, opts) do
+     with opts <- HeaderPreparer.prepare(headers, opts),
+          {:ok, _} <- Recorder.maybe_store_page(body, opts),
+          {:ok, opts} <- record_referrer_url(opts),
+          {:ok, _} <- snap_page(body, opts) do
+       Logger.debug("Fetched #{opts[:url]}")
+
+       %Page{url: opts[:url], body: body, opts: opts}
+     end
+   end
+
+   defp fetch_url_non_200(status_code, opts) do
+     msg = "Failed to fetch #{opts[:url]}, status code: #{status_code}"
+
+     Logger.debug(msg)
+
+     {:warn, msg}
+   end
+
+   defp fetch_url_failed(reason, opts) do
+     msg = "Failed to fetch #{opts[:url]}, reason: #{inspect(reason)}"
+
+     Logger.debug(msg)
+
+     {:warn, msg}
+   end
+
+   defp record_referrer_url(opts) do
+     {:ok, Map.put(opts, :referrer_url, opts[:url])}
+   end
+
+   defp snap_page(body, opts) do
+     if opts[:save_to] do
+       Snapper.snap(body, opts)
+     else
+       {:ok, ""}
+     end
+   end
+ end
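For orientation, a minimal sketch of driving `Crawler.Fetcher.fetch/1` directly. The option keys shown are assumptions read off the module above (`:url`, `:retrier`, `:save_to`); in the package itself the options map is assembled by `Crawler.Options`, so treat this as illustrative rather than the canonical entry point:

    # Hypothetical invocation; option keys inferred from the module above.
    opts = %{
      url: "https://example.com/",      # page to fetch
      retrier: Crawler.Fetcher.Retrier, # module exposing perform/2
      save_to: nil                      # set a directory to snap pages offline
    }

    case Crawler.Fetcher.fetch(opts) do
      %Crawler.Store.Page{url: url, body: body} ->
        IO.puts("Fetched #{url} (#{byte_size(body)} bytes)")

      {:warn, msg} ->
        IO.puts(msg)

      other ->
        # Policer or Recorder can halt the `with` pipeline early.
        IO.inspect(other, label: "fetch halted")
    end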
package/crawler/lib/crawler/http.ex
@@ -0,0 +1,7 @@
+ defmodule Crawler.HTTP do
+   @moduledoc """
+   Custom HTTPoison base module for potential customisation.
+   """
+
+   use HTTPoison.Base
+ end
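`use HTTPoison.Base` makes `Crawler.HTTP` a drop-in HTTPoison client whose behaviour can be customised by overriding the base module's processing callbacks. A sketch of the kind of customisation this enables (the module and values below are illustrative, not part of this package):

    defmodule MyCrawler.HTTP do
      use HTTPoison.Base

      # Send an identifying User-Agent with every request.
      def process_request_headers(headers) do
        [{"User-Agent", "my-crawler/0.1"} | headers]
      end
    end

    # MyCrawler.HTTP.get!("https://example.com/") then behaves like
    # HTTPoison.get!/1, with the header applied to every request.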
package/crawler/lib/crawler/linker/path_builder.ex
@@ -0,0 +1,71 @@
+ defmodule Crawler.Linker.PathBuilder do
+   @moduledoc """
+   Builds a path for a link (which can be a full URL or a relative link) based
+   on the input string, a URL given with or without its protocol.
+   """
+
+   alias Crawler.Linker.PathExpander
+   alias Crawler.Linker.PathFinder
+
+   @doc """
+   Builds a path for a link (which can be a full URL or a relative link) based
+   on the input string, a URL given with or without its protocol.
+
+   ## Examples
+
+       iex> PathBuilder.build_path(
+       iex>   "https://cool.beans:7777/dir/page1",
+       iex>   "https://hello.world:8888/remote/page2"
+       iex> )
+       "hello.world-8888/remote/page2"
+
+       iex> PathBuilder.build_path(
+       iex>   "https://cool.beans:7777/dir/page1",
+       iex>   "local/page2"
+       iex> )
+       "cool.beans-7777/dir/local/page2"
+
+       iex> PathBuilder.build_path(
+       iex>   "https://cool.beans:7777/dir/page1",
+       iex>   "/local/page2"
+       iex> )
+       "cool.beans-7777/local/page2"
+
+       iex> PathBuilder.build_path(
+       iex>   "https://cool.beans:7777/parent/dir/page1",
+       iex>   "../local/page2"
+       iex> )
+       "cool.beans-7777/parent/local/page2"
+
+       iex> PathBuilder.build_path(
+       iex>   "https://cool.beans:7777/parent/dir/page1",
+       iex>   "../../local/page2"
+       iex> )
+       "cool.beans-7777/local/page2"
+   """
+   def build_path(current_url, link, safe \\ true) do
+     current_url
+     |> base_path(link, safe)
+     |> build(link, safe)
+   end
+
+   defp base_path(url, "/" <> _link, safe), do: PathFinder.find_domain(url, safe)
+   defp base_path(url, _link, safe), do: PathFinder.find_base_path(url, safe)
+
+   defp build(path, link, safe) do
+     link
+     |> normalise(path)
+     |> PathFinder.find_path(safe)
+     |> PathExpander.expand_dot()
+   end
+
+   defp normalise(link, path) do
+     link
+     |> String.split("://", parts: 2)
+     |> Enum.count()
+     |> join_path(link, path)
+   end
+
+   defp join_path(2, link, _path), do: link
+   defp join_path(1, link, path), do: Path.join(path, link)
+ end
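The `normalise/2` helper above distinguishes absolute URLs from relative links by counting the parts produced when splitting on `"://"`: two parts means the link is already a full URL, one part means it gets joined onto the base path. The same logic as a standalone sketch (runnable in `iex`; the names are illustrative):

    join_link = fn link, base ->
      case String.split(link, "://", parts: 2) do
        [_scheme, _rest] -> link      # already a full URL, keep as-is
        [_] -> Path.join(base, link)  # relative link, join onto the base path
      end
    end

    join_link.("https://hello.world/page2", "cool.beans-7777/dir")
    #=> "https://hello.world/page2"

    join_link.("local/page2", "cool.beans-7777/dir")
    #=> "cool.beans-7777/dir/local/page2"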
package/crawler/lib/crawler/linker/path_expander.ex
@@ -0,0 +1,59 @@
+ defmodule Crawler.Linker.PathExpander do
+   @moduledoc """
+   Expands a path by resolving any `.` and `..` segments.
+
+   See [this pull request](https://github.com/elixir-lang/elixir/pull/6486).
+   """
+
+   @doc """
+   Expands a path by resolving any `.` and `..` segments.
+
+   Note that this function only expands `.` and `..` segments within the given
+   path; it does not take into account the absolute or relative nature of the
+   path itself. For that, please use `expand/1` or `expand/2`.
+
+   ## Examples
+
+   ### Intended use case
+
+       iex> PathExpander.expand_dot("foo/bar/../baz")
+       "foo/baz"
+
+       iex> PathExpander.expand_dot("/foo/bar/../baz")
+       "/foo/baz"
+
+   ### Non-intended use cases are ignored
+
+       iex> PathExpander.expand_dot("foo/bar/./baz")
+       "foo/bar/baz"
+
+       iex> PathExpander.expand_dot("../foo/bar")
+       "foo/bar"
+   """
+   def expand_dot(<<"/", rest::binary>>),
+     do: "/" <> do_expand_dot(rest)
+
+   def expand_dot(path),
+     do: do_expand_dot(path)
+
+   defp do_expand_dot(path),
+     do: do_expand_dot(:binary.split(path, "/", [:global]), [])
+
+   defp do_expand_dot([".." | t], [_, _ | acc]),
+     do: do_expand_dot(t, acc)
+
+   defp do_expand_dot([".." | t], []),
+     do: do_expand_dot(t, [])
+
+   defp do_expand_dot(["." | t], acc),
+     do: do_expand_dot(t, acc)
+
+   defp do_expand_dot([h | t], acc),
+     do: do_expand_dot(t, ["/", h | acc])
+
+   defp do_expand_dot([], []),
+     do: ""
+
+   defp do_expand_dot([], ["/" | acc]),
+     do: IO.iodata_to_binary(:lists.reverse(acc))
+ end
package/crawler/lib/crawler/linker/path_finder.ex
@@ -0,0 +1,106 @@
+ defmodule Crawler.Linker.PathFinder do
+   @moduledoc """
+   Finds different components of a given URL, e.g. its domain name, directory
+   path, or full path.
+
+   The `safe` option in some of the functions indicates whether the return
+   value should be transformed in order to be safely used as folder and file
+   names.
+   """
+
+   @doc """
+   Finds the URL scheme (e.g. `https://`).
+
+   ## Examples
+
+       iex> PathFinder.find_scheme("http://hi.hello")
+       "http://"
+
+       iex> PathFinder.find_scheme("https://hi.hello:8888/")
+       "https://"
+   """
+   def find_scheme(url) do
+     (url
+ |> String.split("://", part: 2)
+      |> Kernel.hd()) <> "://"
+   end
+
+   @doc """
+   Finds the domain name with port number (e.g. `example.org:8080`).
+
+   ## Examples
+
+       iex> PathFinder.find_domain("http://hi.hello")
+       "hi.hello"
+
+       iex> PathFinder.find_domain("https://hi.hello:8888/world")
+       "hi.hello-8888"
+
+       iex> PathFinder.find_domain("https://hi.hello:8888/world", false)
+       "hi.hello:8888"
+   """
+   def find_domain(url, safe \\ true) do
+     url
+     |> find_path(safe)
+     |> String.split("/", parts: 2)
+     |> Kernel.hd()
+   end
+
+   @doc """
+   Finds the base path of a given page.
+
+   ## Examples
+
+       iex> PathFinder.find_base_path("http://hi.hello")
+       "hi.hello"
+
+       iex> PathFinder.find_base_path("https://hi.hello:8888/dir/world")
+       "hi.hello-8888/dir"
+
+       iex> PathFinder.find_base_path("https://hi.hello:8888/dir/world", false)
+       "hi.hello:8888/dir"
+   """
+   def find_base_path(url, safe \\ true) do
+     url
+     |> find_path(safe)
+     |> String.split("/")
+     |> base_path()
+   end
+
+   defp base_path([path]), do: path
+
+   defp base_path(list) do
+     [_head | tail] = Enum.reverse(list)
+
+     tail
+     |> Enum.reverse()
+     |> Path.join()
+   end
+
+   @doc """
+   Finds the full path of a given page.
+
+   ## Examples
+
+       iex> PathFinder.find_path("http://hi.hello")
+       "hi.hello"
+
+       iex> PathFinder.find_path("https://hi.hello:8888/world")
+       "hi.hello-8888/world"
+
+       iex> PathFinder.find_path("https://hi.hello:8888/world", false)
+       "hi.hello:8888/world"
+   """
+   def find_path(url, safe \\ true)
+
+   def find_path(url, false) do
+     url
+     |> String.split("://", parts: 2)
+     |> Enum.at(-1)
+   end
+
+   def find_path(url, true) do
+     url
+     |> find_path(false)
+     |> String.replace(":", "-")
+   end
+ end
package/crawler/lib/crawler/linker/path_offliner.ex
@@ -0,0 +1,59 @@
+ defmodule Crawler.Linker.PathOffliner do
+   @moduledoc """
+   Transforms a link to be storable and linkable offline.
+   """
+
+   alias Crawler.Linker.PathFinder
+
+   @doc """
+   Transforms a given link so that it can be stored and linked to by other pages.
+
+   When a page does not have a file extension (e.g. `.html`), it is treated as
+   the index page for a directory, so `index.html` is appended to the link.
+
+   ## Examples
+
+       iex> PathOffliner.transform("http://hello.world")
+       "http://hello.world/index.html"
+
+       iex> PathOffliner.transform("hello.world")
+       "hello.world/index.html"
+
+       iex> PathOffliner.transform("hello.world/")
+       "hello.world/index.html"
+
+       iex> PathOffliner.transform("hello/world")
+       "hello/world/index.html"
+
+       iex> PathOffliner.transform("hello/world.html")
+       "hello/world.html"
+   """
+   def transform(link) do
+     link
+     |> PathFinder.find_path()
+     |> String.split("/", trim: true)
+     |> Enum.count()
+     |> last_segment(link)
+   end
+
+   defp last_segment(1, link) do
+     transform_link(false, link)
+   end
+
+   defp last_segment(_count, link) do
+     link
+     |> String.split("/")
+     |> Enum.take(-1)
+     |> Kernel.hd()
+     |> transform_segment(link)
+   end
+
+   defp transform_segment(segment, link) do
+     segment
+     |> String.contains?(".")
+     |> transform_link(link)
+   end
+
+   defp transform_link(true, link), do: link
+   defp transform_link(false, link), do: Path.join(link, "index.html")
+ end
package/crawler/lib/crawler/linker/path_prefixer.ex
@@ -0,0 +1,46 @@
+ defmodule Crawler.Linker.PathPrefixer do
+   @moduledoc """
+   Returns prefixes (`../`s) according to the given URL's structure.
+   """
+
+   alias Crawler.Linker.PathFinder
+   alias Crawler.Linker.PathOffliner
+
+   @doc """
+   Returns prefixes (`../`s) according to the given URL's structure.
+
+   ## Examples
+
+       iex> PathPrefixer.prefix("https://hello.world/")
+       "../"
+
+       iex> PathPrefixer.prefix("https://hello.world/page.html")
+       "../"
+
+       iex> PathPrefixer.prefix("https://hello.world/page")
+       "../../"
+
+       iex> PathPrefixer.prefix("https://hello.world/dir/page.html")
+       "../../"
+
+       iex> PathPrefixer.prefix("https://hello.world/dir/page")
+       "../../../"
+   """
+   def prefix(current_url) do
+     current_url
+     |> PathFinder.find_path()
+     |> PathOffliner.transform()
+     |> count_depth()
+     |> make_prefix()
+   end
+
+   defp count_depth(string, token \\ "/") do
+     (string
+      |> String.split(token)
+      |> Enum.count()) - 1
+   end
+
+   defp make_prefix(depth) do
+     String.duplicate("../", depth)
+   end
+ end
package/crawler/lib/crawler/linker.ex
@@ -0,0 +1,173 @@
+ defmodule Crawler.Linker do
+   @moduledoc """
+   A set of high-level functions for making online and offline URLs and links.
+   """
+
+   alias Crawler.Linker.PathBuilder
+   alias Crawler.Linker.PathFinder
+   alias Crawler.Linker.PathOffliner
+   alias Crawler.Linker.PathPrefixer
+
+   @doc """
+   Given the `current_url`, it works out what the offline URL should be for
+   `link`.
+
+   ## Examples
+
+       iex> Linker.offline_url(
+       iex>   "http://hello.world/dir/page",
+       iex>   "page1"
+       iex> )
+       "http://hello.world/dir/page1/index.html"
+
+       iex> Linker.offline_url(
+       iex>   "http://hello.world/dir/page",
+       iex>   "page1.html"
+       iex> )
+       "http://hello.world/dir/page1.html"
+
+       iex> Linker.offline_url(
+       iex>   "http://hello.world/dir/page",
+       iex>   "../page1"
+       iex> )
+       "http://hello.world/page1/index.html"
+
+       iex> Linker.offline_url(
+       iex>   "http://hello.world/dir/page",
+       iex>   "../page1.html"
+       iex> )
+       "http://hello.world/page1.html"
+
+       iex> Linker.offline_url(
+       iex>   "http://hello.world/dir/page",
+       iex>   "http://thank.you/page1"
+       iex> )
+       "http://thank.you/page1/index.html"
+
+       iex> Linker.offline_url(
+       iex>   "http://hello.world/dir/page",
+       iex>   "http://thank.you/page1.html"
+       iex> )
+       "http://thank.you/page1.html"
+
+       iex> Linker.offline_url(
+       iex>   "http://hello.world/dir/page",
+       iex>   "http://thank.you/"
+       iex> )
+       "http://thank.you/index.html"
+   """
+   def offline_url(current_url, link) do
+     current_url
+     |> url(link)
+     |> PathOffliner.transform()
+   end
+
+   @doc """
+   Given the `current_url`, it works out what the relative
+   offline link should be for `link`.
+
+   ## Examples
+
+       iex> Linker.offline_link(
+       iex>   "http://hello.world/dir/page",
+       iex>   "page1"
+       iex> )
+       "../../../hello.world/dir/page1/index.html"
+
+       iex> Linker.offline_link(
+       iex>   "http://hello.world/dir/page",
+       iex>   "page1.html"
+       iex> )
+       "../../../hello.world/dir/page1.html"
+
+       iex> Linker.offline_link(
+       iex>   "http://hello.world/dir/page",
+       iex>   "../page1"
+       iex> )
+       "../../../hello.world/page1/index.html"
+
+       iex> Linker.offline_link(
+       iex>   "http://hello.world/dir/page",
+       iex>   "../page1.html"
+       iex> )
+       "../../../hello.world/page1.html"
+
+       iex> Linker.offline_link(
+       iex>   "http://hello.world/dir/page",
+       iex>   "http://thank.you/page1"
+       iex> )
+       "../../../thank.you/page1/index.html"
+
+       iex> Linker.offline_link(
+       iex>   "http://hello.world/dir/page",
+       iex>   "http://thank.you/page1.html"
+       iex> )
+       "../../../thank.you/page1.html"
+   """
+   def offline_link(current_url, link) do
+     current_url
+     |> link(link)
+     |> PathOffliner.transform()
+   end
+
+   @doc """
+   Given the `current_url`, it works out what the URL should be for `link`.
+
+   ## Examples
+
+       iex> Linker.url(
+       iex>   "http://another.domain:8888/page",
+       iex>   "/dir/page2"
+       iex> )
+       "http://another.domain:8888/dir/page2"
+
+       iex> Linker.url(
+       iex>   "http://another.domain:8888/parent/page",
+       iex>   "dir/page2"
+       iex> )
+       "http://another.domain:8888/parent/dir/page2"
+   """
+   def url(current_url, link) do
+     Path.join(
+       PathFinder.find_scheme(current_url),
+       PathBuilder.build_path(current_url, link, false)
+     )
+   end
+
+   @doc """
+   Given the `current_url`, it works out what the relative link should be for
+   `link`.
+
+   ## Examples
+
+       iex> Linker.link(
+       iex>   "http://another.domain/page.html",
+       iex>   "/dir/page2"
+       iex> )
+       "../another.domain/dir/page2"
+
+       iex> Linker.link(
+       iex>   "http://another.domain/page",
+       iex>   "/dir/page2"
+       iex> )
+       "../../another.domain/dir/page2"
+
+       iex> Linker.link(
+       iex>   "http://another.domain/parent/page",
+       iex>   "dir/page2"
+       iex> )
+       "../../../another.domain/parent/dir/page2"
+
+       iex> Linker.link(
+       iex>   "http://another.domain/parent/page",
+       iex>   "../dir/page2"
+       iex> )
+       "../../../another.domain/dir/page2"
+   """
+   def link(current_url, link) do
+     Path.join(
+       PathPrefixer.prefix(current_url),
+       PathBuilder.build_path(current_url, link)
+     )
+   end
+ end