powerdlz23 1.2.3 → 1.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. package/Spider/README.md +19 -0
  2. package/Spider/domain.py +18 -0
  3. package/Spider/general.py +51 -0
  4. package/Spider/link_finder.py +25 -0
  5. package/Spider/main.py +50 -0
  6. package/Spider/spider.py +74 -0
  7. package/crawler/.formatter.exs +5 -0
  8. package/crawler/.github/workflows/ci.yml +29 -0
  9. package/crawler/.recode.exs +33 -0
  10. package/crawler/.tool-versions +2 -0
  11. package/crawler/CHANGELOG.md +82 -0
  12. package/crawler/README.md +198 -0
  13. package/crawler/architecture.svg +4 -0
  14. package/crawler/config/config.exs +9 -0
  15. package/crawler/config/dev.exs +5 -0
  16. package/crawler/config/test.exs +5 -0
  17. package/crawler/examples/google_search/scraper.ex +37 -0
  18. package/crawler/examples/google_search/url_filter.ex +11 -0
  19. package/crawler/examples/google_search.ex +77 -0
  20. package/crawler/lib/crawler/dispatcher/worker.ex +14 -0
  21. package/crawler/lib/crawler/dispatcher.ex +20 -0
  22. package/crawler/lib/crawler/fetcher/header_preparer.ex +60 -0
  23. package/crawler/lib/crawler/fetcher/modifier.ex +45 -0
  24. package/crawler/lib/crawler/fetcher/policer.ex +77 -0
  25. package/crawler/lib/crawler/fetcher/recorder.ex +55 -0
  26. package/crawler/lib/crawler/fetcher/requester.ex +32 -0
  27. package/crawler/lib/crawler/fetcher/retrier.ex +43 -0
  28. package/crawler/lib/crawler/fetcher/url_filter.ex +26 -0
  29. package/crawler/lib/crawler/fetcher.ex +81 -0
  30. package/crawler/lib/crawler/http.ex +7 -0
  31. package/crawler/lib/crawler/linker/path_builder.ex +71 -0
  32. package/crawler/lib/crawler/linker/path_expander.ex +59 -0
  33. package/crawler/lib/crawler/linker/path_finder.ex +106 -0
  34. package/crawler/lib/crawler/linker/path_offliner.ex +59 -0
  35. package/crawler/lib/crawler/linker/path_prefixer.ex +46 -0
  36. package/crawler/lib/crawler/linker.ex +173 -0
  37. package/crawler/lib/crawler/options.ex +127 -0
  38. package/crawler/lib/crawler/parser/css_parser.ex +37 -0
  39. package/crawler/lib/crawler/parser/guarder.ex +38 -0
  40. package/crawler/lib/crawler/parser/html_parser.ex +41 -0
  41. package/crawler/lib/crawler/parser/link_parser/link_expander.ex +32 -0
  42. package/crawler/lib/crawler/parser/link_parser.ex +50 -0
  43. package/crawler/lib/crawler/parser.ex +122 -0
  44. package/crawler/lib/crawler/queue_handler.ex +45 -0
  45. package/crawler/lib/crawler/scraper.ex +28 -0
  46. package/crawler/lib/crawler/snapper/dir_maker.ex +45 -0
  47. package/crawler/lib/crawler/snapper/link_replacer.ex +95 -0
  48. package/crawler/lib/crawler/snapper.ex +82 -0
  49. package/crawler/lib/crawler/store/counter.ex +19 -0
  50. package/crawler/lib/crawler/store/page.ex +7 -0
  51. package/crawler/lib/crawler/store.ex +87 -0
  52. package/crawler/lib/crawler/worker.ex +62 -0
  53. package/crawler/lib/crawler.ex +91 -0
  54. package/crawler/mix.exs +78 -0
  55. package/crawler/mix.lock +40 -0
  56. package/crawler/test/fixtures/introducing-elixir.jpg +0 -0
  57. package/crawler/test/integration_test.exs +135 -0
  58. package/crawler/test/lib/crawler/dispatcher/worker_test.exs +7 -0
  59. package/crawler/test/lib/crawler/dispatcher_test.exs +5 -0
  60. package/crawler/test/lib/crawler/fetcher/header_preparer_test.exs +7 -0
  61. package/crawler/test/lib/crawler/fetcher/policer_test.exs +71 -0
  62. package/crawler/test/lib/crawler/fetcher/recorder_test.exs +9 -0
  63. package/crawler/test/lib/crawler/fetcher/requester_test.exs +9 -0
  64. package/crawler/test/lib/crawler/fetcher/retrier_test.exs +7 -0
  65. package/crawler/test/lib/crawler/fetcher/url_filter_test.exs +7 -0
  66. package/crawler/test/lib/crawler/fetcher_test.exs +153 -0
  67. package/crawler/test/lib/crawler/http_test.exs +47 -0
  68. package/crawler/test/lib/crawler/linker/path_builder_test.exs +7 -0
  69. package/crawler/test/lib/crawler/linker/path_expander_test.exs +7 -0
  70. package/crawler/test/lib/crawler/linker/path_finder_test.exs +7 -0
  71. package/crawler/test/lib/crawler/linker/path_offliner_test.exs +7 -0
  72. package/crawler/test/lib/crawler/linker/path_prefixer_test.exs +7 -0
  73. package/crawler/test/lib/crawler/linker_test.exs +7 -0
  74. package/crawler/test/lib/crawler/options_test.exs +7 -0
  75. package/crawler/test/lib/crawler/parser/css_parser_test.exs +7 -0
  76. package/crawler/test/lib/crawler/parser/guarder_test.exs +7 -0
  77. package/crawler/test/lib/crawler/parser/html_parser_test.exs +7 -0
  78. package/crawler/test/lib/crawler/parser/link_parser/link_expander_test.exs +7 -0
  79. package/crawler/test/lib/crawler/parser/link_parser_test.exs +7 -0
  80. package/crawler/test/lib/crawler/parser_test.exs +8 -0
  81. package/crawler/test/lib/crawler/queue_handler_test.exs +7 -0
  82. package/crawler/test/lib/crawler/scraper_test.exs +7 -0
  83. package/crawler/test/lib/crawler/snapper/dir_maker_test.exs +7 -0
  84. package/crawler/test/lib/crawler/snapper/link_replacer_test.exs +7 -0
  85. package/crawler/test/lib/crawler/snapper_test.exs +9 -0
  86. package/crawler/test/lib/crawler/worker_test.exs +5 -0
  87. package/crawler/test/lib/crawler_test.exs +295 -0
  88. package/crawler/test/support/test_case.ex +24 -0
  89. package/crawler/test/support/test_helpers.ex +28 -0
  90. package/crawler/test/test_helper.exs +7 -0
  91. package/package.json +1 -1
  92. package/rubyretriever/.rspec +2 -0
  93. package/rubyretriever/.travis.yml +7 -0
  94. package/rubyretriever/Gemfile +3 -0
  95. package/rubyretriever/Gemfile.lock +64 -0
  96. package/rubyretriever/LICENSE +20 -0
  97. package/rubyretriever/Rakefile +7 -0
  98. package/rubyretriever/bin/rr +79 -0
  99. package/rubyretriever/lib/retriever/cli.rb +25 -0
  100. package/rubyretriever/lib/retriever/core_ext.rb +13 -0
  101. package/rubyretriever/lib/retriever/fetch.rb +268 -0
  102. package/rubyretriever/lib/retriever/fetchfiles.rb +71 -0
  103. package/rubyretriever/lib/retriever/fetchseo.rb +18 -0
  104. package/rubyretriever/lib/retriever/fetchsitemap.rb +43 -0
  105. package/rubyretriever/lib/retriever/link.rb +47 -0
  106. package/rubyretriever/lib/retriever/openuri_redirect_patch.rb +8 -0
  107. package/rubyretriever/lib/retriever/page.rb +104 -0
  108. package/rubyretriever/lib/retriever/page_iterator.rb +21 -0
  109. package/rubyretriever/lib/retriever/target.rb +47 -0
  110. package/rubyretriever/lib/retriever/version.rb +4 -0
  111. package/rubyretriever/lib/retriever.rb +15 -0
  112. package/rubyretriever/readme.md +166 -0
  113. package/rubyretriever/rubyretriever.gemspec +41 -0
  114. package/rubyretriever/spec/link_spec.rb +77 -0
  115. package/rubyretriever/spec/page_spec.rb +94 -0
  116. package/rubyretriever/spec/retriever_spec.rb +84 -0
  117. package/rubyretriever/spec/spec_helper.rb +17 -0
  118. package/rubyretriever/spec/target_spec.rb +55 -0

package/crawler/lib/crawler/options.ex
@@ -0,0 +1,127 @@
+ defmodule Crawler.Options do
+   @moduledoc """
+   Options for the crawler.
+   """
+
+   alias Crawler.Mixfile
+   alias Crawler.Store
+
+   @assets []
+   @save_to nil
+   @workers 10
+   @interval 0
+   @max_depths 3
+   @max_pages :infinity
+   @timeout 5_000
+   @retries 2
+   @store nil
+   @force false
+   @scope nil
+   @user_agent "Crawler/#{Mixfile.project()[:version]} (https://github.com/fredwu/crawler)"
+   @url_filter Crawler.Fetcher.UrlFilter
+   @retrier Crawler.Fetcher.Retrier
+   @modifier Crawler.Fetcher.Modifier
+   @scraper Crawler.Scraper
+   @parser Crawler.Parser
+   @encode_uri false
+   @queue nil
+
+   @doc """
+   Assigns default option values.
+
+   ## Examples
+
+       iex> Options.assign_defaults(%{}) |> Map.has_key?(:depth)
+       true
+
+       iex> Options.assign_defaults(%{}) |> Map.get(:max_depths)
+       3
+
+       iex> Options.assign_defaults(%{max_depths: 4}) |> Map.get(:max_depths)
+       4
+   """
+   def assign_defaults(opts) do
+     Map.merge(
+       %{
+         depth: 0,
+         html_tag: "a",
+         assets: assets(),
+         save_to: save_to(),
+         workers: workers(),
+         interval: interval(),
+         max_depths: max_depths(),
+         max_pages: max_pages(),
+         timeout: timeout(),
+         retries: retries(),
+         store: store(),
+         force: force(),
+         scope: scope(),
+         user_agent: user_agent(),
+         url_filter: url_filter(),
+         retrier: retrier(),
+         modifier: modifier(),
+         scraper: scraper(),
+         parser: parser(),
+         encode_uri: encode_uri(),
+         queue: queue()
+       },
+       opts
+     )
+   end
+
+   @doc """
+   Takes the `url` argument and puts it in the `opts`.
+
+   The `opts` map gets passed around internally and eventually gets stored in
+   the registry.
+
+   ## Examples
+
+       iex> Options.assign_url(%{}, "http://options/")
+       %{url: "http://options/"}
+
+       iex> Options.assign_url(%{url: "http://example.com/"}, "http://options/")
+       %{url: "http://options/"}
+   """
+   def assign_url(%{encode_uri: true} = opts, url) do
+     Map.merge(opts, %{url: URI.encode(url)})
+   end
+
+   def assign_url(opts, url) do
+     Map.merge(opts, %{url: url})
+   end
+
+   def assign_scope(%{force: true, scope: nil} = opts) do
+     Map.merge(opts, %{scope: System.unique_integer()})
+   end
+
+   def assign_scope(opts), do: opts
+
+   def perform_default_actions(%{depth: 0} = opts) do
+     Store.ops_reset()
+
+     opts
+   end
+
+   def perform_default_actions(opts), do: opts
+
+   defp assets, do: Application.get_env(:crawler, :assets, @assets)
+   defp save_to, do: Application.get_env(:crawler, :save_to, @save_to)
+   defp workers, do: Application.get_env(:crawler, :workers, @workers)
+   defp interval, do: Application.get_env(:crawler, :interval, @interval)
+   defp max_depths, do: Application.get_env(:crawler, :max_depths, @max_depths)
+   defp max_pages, do: Application.get_env(:crawler, :max_pages, @max_pages)
+   defp timeout, do: Application.get_env(:crawler, :timeout, @timeout)
+   defp retries, do: Application.get_env(:crawler, :retries, @retries)
+   defp store, do: Application.get_env(:crawler, :store, @store)
+   defp force, do: Application.get_env(:crawler, :force, @force)
+   defp scope, do: Application.get_env(:crawler, :scope, @scope)
+   defp user_agent, do: Application.get_env(:crawler, :user_agent, @user_agent)
+   defp url_filter, do: Application.get_env(:crawler, :url_filter, @url_filter)
+   defp retrier, do: Application.get_env(:crawler, :retrier, @retrier)
+   defp modifier, do: Application.get_env(:crawler, :modifier, @modifier)
+   defp scraper, do: Application.get_env(:crawler, :scraper, @scraper)
+   defp parser, do: Application.get_env(:crawler, :parser, @parser)
+   defp encode_uri, do: Application.get_env(:crawler, :encode_uri, @encode_uri)
+   defp queue, do: Application.get_env(:crawler, :queue, @queue)
+ end

package/crawler/lib/crawler/parser/css_parser.ex
@@ -0,0 +1,37 @@
+ defmodule Crawler.Parser.CssParser do
+   @moduledoc """
+   Parses CSS files.
+   """
+
+   @url_unsafe_chars ")'\""
+
+   @doc """
+   Parses CSS files.
+
+   ## Examples
+
+       iex> CssParser.parse(
+       iex>   "img { url(http://hello.world) }"
+       iex> )
+       [{"link", [{"href", "http://hello.world"}], []}]
+
+       iex> CssParser.parse(
+       iex>   "@font-face { src: url('icons.ttf') format('truetype'); }"
+       iex> )
+       [{"link", [{"href", "icons.ttf"}], []}]
+
+       iex> CssParser.parse(
+       iex>   "@font-face { src: url('data:application/blah'); }"
+       iex> )
+       []
+   """
+   def parse(body) do
+     ~r{url\(['"]?((?!data:)[^#{@url_unsafe_chars}]+)['"]?\)}
+     |> Regex.scan(body, capture: :all_but_first)
+     |> Enum.map(&prep_css_element/1)
+   end
+
+   defp prep_css_element([link]) do
+     {"link", [{"href", link}], []}
+   end
+ end

package/crawler/lib/crawler/parser/guarder.ex
@@ -0,0 +1,38 @@
+ defmodule Crawler.Parser.Guarder do
+   @moduledoc """
+   Detects whether a page is parsable.
+   """
+
+   @doc """
+   Detects whether a page is parsable.
+
+   ## Examples
+
+       iex> Guarder.pass?(
+       iex>   %{html_tag: "link", content_type: "text/css"}
+       iex> )
+       true
+
+       iex> Guarder.pass?(
+       iex>   %{html_tag: "img", content_type: "text/css"}
+       iex> )
+       false
+
+       iex> Guarder.pass?(
+       iex>   %{html_tag: "link", content_type: "text/css"}
+       iex> )
+       true
+
+       iex> Guarder.pass?(
+       iex>   %{html_tag: "link", content_type: "image/png"}
+       iex> )
+       false
+   """
+   def pass?(opts) do
+     is_text_link?(opts[:html_tag]) && is_text_file?(opts[:content_type])
+   end
+
+   defp is_text_link?(html_tag), do: Enum.member?(["a", "link"], html_tag)
+
+   defp is_text_file?(content_type), do: String.starts_with?(content_type, "text")
+ end

package/crawler/lib/crawler/parser/html_parser.ex
@@ -0,0 +1,41 @@
+ defmodule Crawler.Parser.HtmlParser do
+   @moduledoc """
+   Parses HTML files.
+   """
+
+   @tag_selectors %{
+     "pages" => "a",
+     "js" => "script[type='text/javascript'][src]",
+     "css" => "link[rel='stylesheet']",
+     "images" => "img"
+   }
+
+   @doc """
+   Parses HTML files.
+
+   ## Examples
+
+       iex> HtmlParser.parse(
+       iex>   "<a href='http://hello.world'>Link</a>",
+       iex>   %{}
+       iex> )
+       [{"a", [{"href", "http://hello.world"}], ["Link"]}]
+
+       iex> HtmlParser.parse(
+       iex>   "<script type='text/javascript'>js</script>",
+       iex>   %{assets: ["js"]}
+       iex> )
+       []
+   """
+   def parse(body, opts) do
+     {:ok, document} = Floki.parse_document(body)
+     Floki.find(document, selectors(opts))
+   end
+
+   defp selectors(opts) do
+     @tag_selectors
+     |> Map.take(["pages"] ++ (opts[:assets] || []))
+     |> Map.values()
+     |> Enum.join(", ")
+   end
+ end

package/crawler/lib/crawler/parser/link_parser/link_expander.ex
@@ -0,0 +1,32 @@
+ defmodule Crawler.Parser.LinkParser.LinkExpander do
+   @moduledoc """
+   Expands a link into a full URL.
+   """
+
+   alias Crawler.Linker
+
+   @doc """
+   Expands a link into a full URL.
+
+   ## Examples
+
+       iex> LinkExpander.expand({"href", "http://hello.world"}, %{})
+       {"href", "http://hello.world"}
+
+       iex> LinkExpander.expand({"href", "page"}, %{referrer_url: "http://hello.world"})
+       {"link", "page", "href", "http://hello.world/page"}
+   """
+   def expand({_src, link} = element, opts) do
+     link
+     |> is_url?()
+     |> transform_link(element, opts)
+   end
+
+   defp is_url?(link), do: String.contains?(link, "://")
+
+   defp transform_link(true, element, _opts), do: element
+
+   defp transform_link(false, {src, link}, opts) do
+     {"link", link, src, Linker.url(opts[:referrer_url], link)}
+   end
+ end

package/crawler/lib/crawler/parser/link_parser.ex
@@ -0,0 +1,50 @@
+ defmodule Crawler.Parser.LinkParser do
+   @moduledoc """
+   Parses links and transforms them if necessary.
+   """
+
+   alias Crawler.Parser.LinkParser.LinkExpander
+
+   @tag_attr %{
+     "a" => "href",
+     "link" => "href",
+     "script" => "src",
+     "img" => "src"
+   }
+
+   @doc """
+   Parses links and transforms them if necessary.
+
+   ## Examples
+
+       iex> LinkParser.parse(
+       iex>   {"a", [{"hello", "world"}, {"href", "http://hello.world"}], []},
+       iex>   %{},
+       iex>   &Kernel.inspect(&1, Enum.into(&2, []))
+       iex> )
+       "{\\\"href\\\", \\\"http://hello.world\\\"}"
+
+       iex> LinkParser.parse(
+       iex>   {"img", [{"hello", "world"}, {"src", "http://hello.world"}], []},
+       iex>   %{},
+       iex>   &Kernel.inspect(&1, Enum.into(&2, []))
+       iex> )
+       "{\\\"src\\\", \\\"http://hello.world\\\"}"
+   """
+   def parse({tag, attrs, _}, opts, link_handler) do
+     src = @tag_attr[tag]
+
+     with {_tag, link} <- detect_link(src, attrs),
+          element <- LinkExpander.expand({src, link}, opts) do
+       opts = Map.merge(opts, %{html_tag: tag})
+
+       link_handler.(element, opts)
+     end
+   end
+
+   defp detect_link(src, attrs) do
+     Enum.find(attrs, fn attr ->
+       Kernel.match?({^src, _link}, attr)
+     end)
+   end
+ end

package/crawler/lib/crawler/parser.ex
@@ -0,0 +1,122 @@
+ defmodule Crawler.Parser do
+   @moduledoc """
+   Parses pages and calls a link handler to handle the detected links.
+   """
+
+   alias Crawler.Dispatcher
+   alias Crawler.Parser.CssParser
+   alias Crawler.Parser.Guarder
+   alias Crawler.Parser.HtmlParser
+   alias Crawler.Parser.LinkParser
+
+   require Logger
+
+   defmodule Spec do
+     @moduledoc """
+     Spec for defining a parser.
+     """
+
+     alias Crawler.Store.Page
+
+     @type url :: String.t()
+     @type body :: String.t()
+     @type opts :: map
+     @type page :: %Page{url: url, body: body, opts: opts}
+
+     @callback parse(page) :: {:ok, page}
+     @callback parse({:error, term}) :: :ok
+   end
+
+   @behaviour __MODULE__.Spec
+
+   @doc """
+   Parses the links and returns the page.
+
+   There are two hooks:
+
+   - `link_handler` is useful when a custom parser calls this default parser and
+     utilises a different link handler for processing links.
+   - `scraper` is useful for scraping content immediately as the parser parses
+     the page. Alternatively, you can access the crawled data asynchronously;
+     refer to the [README](https://github.com/fredwu/crawler#usage).
+
+   ## Examples
+
+       iex> {:ok, page} = Parser.parse(%Page{
+       iex>   body: "Body",
+       iex>   opts: %{scraper: Crawler.Scraper, html_tag: "a", content_type: "text/html"}
+       iex> })
+       iex> page.body
+       "Body"
+
+       iex> {:ok, page} = Parser.parse(%Page{
+       iex>   body: "<a href='http://parser/1'>Link</a>",
+       iex>   opts: %{scraper: Crawler.Scraper, html_tag: "a", content_type: "text/html"}
+       iex> })
+       iex> page.body
+       "<a href='http://parser/1'>Link</a>"
+
+       iex> {:ok, page} = Parser.parse(%Page{
+       iex>   body: "<a name='hello'>Link</a>",
+       iex>   opts: %{scraper: Crawler.Scraper, html_tag: "a", content_type: "text/html"}
+       iex> })
+       iex> page.body
+       "<a name='hello'>Link</a>"
+
+       iex> {:ok, page} = Parser.parse(%Page{
+       iex>   body: "<a href='http://parser/2' target='_blank'>Link</a>",
+       iex>   opts: %{scraper: Crawler.Scraper, html_tag: "a", content_type: "text/html"}
+       iex> })
+       iex> page.body
+       "<a href='http://parser/2' target='_blank'>Link</a>"
+
+       iex> {:ok, page} = Parser.parse(%Page{
+       iex>   body: "<a href='parser/2'>Link</a>",
+       iex>   opts: %{scraper: Crawler.Scraper, html_tag: "a", content_type: "text/html", referrer_url: "http://hello"}
+       iex> })
+       iex> page.body
+       "<a href='parser/2'>Link</a>"
+
+       iex> {:ok, page} = Parser.parse(%Page{
+       iex>   body: "<a href='../parser/2'>Link</a>",
+       iex>   opts: %{scraper: Crawler.Scraper, html_tag: "a", content_type: "text/html", referrer_url: "http://hello"}
+       iex> })
+       iex> page.body
+       "<a href='../parser/2'>Link</a>"
+
+       iex> {:ok, page} = Parser.parse(%Page{
+       iex>   body: image_file(),
+       iex>   opts: %{scraper: Crawler.Scraper, html_tag: "img", content_type: "image/png"}
+       iex> })
+       iex> page.body
+       "\#{image_file()}"
+   """
+   def parse(input)
+
+   def parse({:warn, reason}), do: Logger.debug(fn -> "#{inspect(reason)}" end)
+   def parse({:error, reason}), do: Logger.error(fn -> "#{inspect(reason)}" end)
+
+   def parse(%{body: body, opts: opts} = page) do
+     parse_links(body, opts, &Dispatcher.dispatch(&1, &2))
+
+     {:ok, _page} = opts[:scraper].scrape(page)
+   end
+
+   def parse_links(body, opts, link_handler) do
+     opts
+     |> Guarder.pass?()
+     |> do_parse_links(body, opts, link_handler)
+   end
+
+   defp do_parse_links(false, _body, _opts, _link_handler), do: []
+
+   defp do_parse_links(true, body, opts, link_handler) do
+     Enum.map(
+       parse_file(body, opts),
+       &LinkParser.parse(&1, opts, link_handler)
+     )
+   end
+
+   defp parse_file(body, %{content_type: "text/css"}), do: CssParser.parse(body)
+   defp parse_file(body, opts), do: HtmlParser.parse(body, opts)
+ end

package/crawler/lib/crawler/queue_handler.ex
@@ -0,0 +1,45 @@
+ defmodule Crawler.QueueHandler do
+   @moduledoc """
+   Handles the queueing of crawl requests.
+   """
+
+   alias Crawler.Dispatcher.Worker
+
+   @doc """
+   Enqueues a crawl request.
+
+   Also initialises the queue if it's not already initialised; this is necessary
+   so that consumer apps don't have to manually handle the queue initialisation.
+   """
+   def enqueue(opts) do
+     opts = init_queue(opts[:queue], opts)
+
+     OPQ.enqueue(opts[:queue], opts)
+
+     {:ok, opts}
+   end
+
+   defp init_queue(nil, opts) do
+     {:ok, _} =
+       DynamicSupervisor.start_child(
+         Crawler.QueueSupervisor,
+         {OPQ,
+          [
+            worker: Worker,
+            workers: opts[:workers],
+            interval: opts[:interval],
+            timeout: opts[:timeout]
+          ]}
+       )
+
+     pid =
+       Crawler.QueueSupervisor
+       |> Supervisor.which_children()
+       |> List.last()
+       |> elem(1)
+
+     Map.merge(opts, %{queue: pid})
+   end
+
+   defp init_queue(_queue, opts), do: opts
+ end

package/crawler/lib/crawler/scraper.ex
@@ -0,0 +1,28 @@
+ defmodule Crawler.Scraper do
+   @moduledoc """
+   A placeholder module that demonstrates the scraping interface.
+   """
+
+   defmodule Spec do
+     @moduledoc """
+     Spec for defining a scraper.
+     """
+
+     alias Crawler.Store.Page
+
+     @type url :: String.t()
+     @type body :: String.t()
+     @type opts :: map
+     @type page :: %Page{url: url, body: body, opts: opts}
+
+     @callback scrape(page) :: {:ok, page}
+   end
+
+   alias Crawler.Store.Page
+
+   @behaviour __MODULE__.Spec
+
+   @doc """
+   """
+   def scrape(%Page{url: _url, body: _body, opts: _opts} = page), do: {:ok, page}
+ end
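
A custom `:scraper` only has to satisfy `Crawler.Scraper.Spec` above: take a `%Page{}` and return `{:ok, page}`. A hedged sketch; the module name and the `IO.puts/1` side effect are illustrative only:

    defmodule MyApp.UrlLoggingScraper do
      @behaviour Crawler.Scraper.Spec

      alias Crawler.Store.Page

      # Print the URL of every crawled page, then return the page unchanged.
      def scrape(%Page{url: url, body: _body, opts: _opts} = page) do
        IO.puts("crawled: " <> url)
        {:ok, page}
      end
    end

Such a module would then be plugged in via the `scraper:` option.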

package/crawler/lib/crawler/snapper/dir_maker.ex
@@ -0,0 +1,45 @@
+ defmodule Crawler.Snapper.DirMaker do
+   @moduledoc """
+   Makes a new (nested) folder according to the options provided.
+   """
+
+   alias Crawler.Linker
+   alias Crawler.Linker.PathFinder
+
+   @doc """
+   Makes a new (nested) folder according to the options provided.
+
+   ## Examples
+
+       iex> DirMaker.make_dir(
+       iex>   save_to: tmp("snapper/dir_creator"),
+       iex>   url: "http://hello-world.local"
+       iex> )
+       iex> |> Path.relative_to_cwd
+       "test/tmp/snapper/dir_creator/hello-world.local/index.html"
+   """
+   def make_dir(opts) do
+     opts[:url]
+     |> prep_filepath()
+     |> build_save_path(opts[:save_to])
+     |> make_save_path(opts[:save_to])
+   end
+
+   defp prep_filepath(url) do
+     url
+     |> Linker.offline_url(url)
+     |> PathFinder.find_path()
+   end
+
+   defp build_save_path(path, save_to) do
+     Path.join(save_to, path)
+   end
+
+   defp make_save_path(path, save_to) do
+     if File.exists?(save_to) do
+       File.mkdir_p(Path.dirname(path))
+     end
+
+     path
+   end
+ end

package/crawler/lib/crawler/snapper/link_replacer.ex
@@ -0,0 +1,95 @@
+ defmodule Crawler.Snapper.LinkReplacer do
+   @moduledoc """
+   Replaces links found in a page so they work offline.
+   """
+
+   alias Crawler.Linker
+   alias Crawler.Parser
+
+   @doc """
+   Replaces links found in a page so they work offline.
+
+   ## Examples
+
+       iex> LinkReplacer.replace_links(
+       iex>   "<a href='http://another.domain/page.html'></a>",
+       iex>   %{
+       iex>     url: "http://main.domain/dir/page",
+       iex>     depth: 1,
+       iex>     max_depths: 2,
+       iex>     html_tag: "a",
+       iex>     content_type: "text/html"
+       iex>   }
+       iex> )
+       {:ok, "<a href='../../../another.domain/page.html'></a>"}
+
+       iex> LinkReplacer.replace_links(
+       iex>   "<a href='http://another.domain/dir/page.html'></a>",
+       iex>   %{
+       iex>     url: "http://main.domain/page",
+       iex>     depth: 1,
+       iex>     max_depths: 2,
+       iex>     html_tag: "a",
+       iex>     content_type: "text/html"
+       iex>   }
+       iex> )
+       {:ok, "<a href='../../another.domain/dir/page.html'></a>"}
+
+       iex> LinkReplacer.replace_links(
+       iex>   "<a href='http://another.domain/dir/page'></a>",
+       iex>   %{
+       iex>     url: "http://main.domain/dir/page",
+       iex>     depth: 1,
+       iex>     max_depths: 2,
+       iex>     html_tag: "a",
+       iex>     content_type: "text/html"
+       iex>   }
+       iex> )
+       {:ok, "<a href='../../../another.domain/dir/page/index.html'></a>"}
+
+       iex> LinkReplacer.replace_links(
+       iex>   "<a href='/dir/page2.html'></a>",
+       iex>   %{
+       iex>     url: "http://main.domain/dir/page",
+       iex>     referrer_url: "http://main.domain/dir/page",
+       iex>     depth: 1,
+       iex>     max_depths: 2,
+       iex>     html_tag: "a",
+       iex>     content_type: "text/html"
+       iex>   }
+       iex> )
+       {:ok, "<a href='../../../main.domain/dir/page2.html'></a>"}
+   """
+   def replace_links(body, opts) do
+     new_body =
+       body
+       |> Parser.parse_links(opts, &get_link/2)
+       |> List.flatten()
+       |> Enum.reject(&(&1 == nil))
+       |> Enum.reduce(body, &modify_body(opts[:content_type], &2, opts[:url], &1))
+
+     {:ok, new_body}
+   end
+
+   defp get_link({_, url}, _opts), do: url
+   defp get_link({_, link, _, url}, _opts), do: [link, url]
+
+   defp modify_body(content_type, body, current_url, link) do
+     String.replace(
+       body,
+       regexes(content_type, link),
+       modify_link(current_url, link)
+     )
+   end
+
+   defp regexes(content_type, link) do
+     case content_type do
+       "text/css" -> ~r{((?!url)\(['"]?)#{link}(['"]?\))}
+       _ -> ~r{((?!src|href)=['"])#{link}(['"])}
+     end
+   end
+
+   defp modify_link(current_url, link) do
+     "\\1" <> Linker.offline_link(current_url, link) <> "\\2"
+   end
+ end