powerdlz23 1.2.3 → 1.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Spider/README.md +19 -0
- package/Spider/domain.py +18 -0
- package/Spider/general.py +51 -0
- package/Spider/link_finder.py +25 -0
- package/Spider/main.py +50 -0
- package/Spider/spider.py +74 -0
- package/crawler/.formatter.exs +5 -0
- package/crawler/.github/workflows/ci.yml +29 -0
- package/crawler/.recode.exs +33 -0
- package/crawler/.tool-versions +2 -0
- package/crawler/CHANGELOG.md +82 -0
- package/crawler/README.md +198 -0
- package/crawler/architecture.svg +4 -0
- package/crawler/config/config.exs +9 -0
- package/crawler/config/dev.exs +5 -0
- package/crawler/config/test.exs +5 -0
- package/crawler/examples/google_search/scraper.ex +37 -0
- package/crawler/examples/google_search/url_filter.ex +11 -0
- package/crawler/examples/google_search.ex +77 -0
- package/crawler/lib/crawler/dispatcher/worker.ex +14 -0
- package/crawler/lib/crawler/dispatcher.ex +20 -0
- package/crawler/lib/crawler/fetcher/header_preparer.ex +60 -0
- package/crawler/lib/crawler/fetcher/modifier.ex +45 -0
- package/crawler/lib/crawler/fetcher/policer.ex +77 -0
- package/crawler/lib/crawler/fetcher/recorder.ex +55 -0
- package/crawler/lib/crawler/fetcher/requester.ex +32 -0
- package/crawler/lib/crawler/fetcher/retrier.ex +43 -0
- package/crawler/lib/crawler/fetcher/url_filter.ex +26 -0
- package/crawler/lib/crawler/fetcher.ex +81 -0
- package/crawler/lib/crawler/http.ex +7 -0
- package/crawler/lib/crawler/linker/path_builder.ex +71 -0
- package/crawler/lib/crawler/linker/path_expander.ex +59 -0
- package/crawler/lib/crawler/linker/path_finder.ex +106 -0
- package/crawler/lib/crawler/linker/path_offliner.ex +59 -0
- package/crawler/lib/crawler/linker/path_prefixer.ex +46 -0
- package/crawler/lib/crawler/linker.ex +173 -0
- package/crawler/lib/crawler/options.ex +127 -0
- package/crawler/lib/crawler/parser/css_parser.ex +37 -0
- package/crawler/lib/crawler/parser/guarder.ex +38 -0
- package/crawler/lib/crawler/parser/html_parser.ex +41 -0
- package/crawler/lib/crawler/parser/link_parser/link_expander.ex +32 -0
- package/crawler/lib/crawler/parser/link_parser.ex +50 -0
- package/crawler/lib/crawler/parser.ex +122 -0
- package/crawler/lib/crawler/queue_handler.ex +45 -0
- package/crawler/lib/crawler/scraper.ex +28 -0
- package/crawler/lib/crawler/snapper/dir_maker.ex +45 -0
- package/crawler/lib/crawler/snapper/link_replacer.ex +95 -0
- package/crawler/lib/crawler/snapper.ex +82 -0
- package/crawler/lib/crawler/store/counter.ex +19 -0
- package/crawler/lib/crawler/store/page.ex +7 -0
- package/crawler/lib/crawler/store.ex +87 -0
- package/crawler/lib/crawler/worker.ex +62 -0
- package/crawler/lib/crawler.ex +91 -0
- package/crawler/mix.exs +78 -0
- package/crawler/mix.lock +40 -0
- package/crawler/test/fixtures/introducing-elixir.jpg +0 -0
- package/crawler/test/integration_test.exs +135 -0
- package/crawler/test/lib/crawler/dispatcher/worker_test.exs +7 -0
- package/crawler/test/lib/crawler/dispatcher_test.exs +5 -0
- package/crawler/test/lib/crawler/fetcher/header_preparer_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher/policer_test.exs +71 -0
- package/crawler/test/lib/crawler/fetcher/recorder_test.exs +9 -0
- package/crawler/test/lib/crawler/fetcher/requester_test.exs +9 -0
- package/crawler/test/lib/crawler/fetcher/retrier_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher/url_filter_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher_test.exs +153 -0
- package/crawler/test/lib/crawler/http_test.exs +47 -0
- package/crawler/test/lib/crawler/linker/path_builder_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_expander_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_finder_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_offliner_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_prefixer_test.exs +7 -0
- package/crawler/test/lib/crawler/linker_test.exs +7 -0
- package/crawler/test/lib/crawler/options_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/css_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/guarder_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/html_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/link_parser/link_expander_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/link_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser_test.exs +8 -0
- package/crawler/test/lib/crawler/queue_handler_test.exs +7 -0
- package/crawler/test/lib/crawler/scraper_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper/dir_maker_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper/link_replacer_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper_test.exs +9 -0
- package/crawler/test/lib/crawler/worker_test.exs +5 -0
- package/crawler/test/lib/crawler_test.exs +295 -0
- package/crawler/test/support/test_case.ex +24 -0
- package/crawler/test/support/test_helpers.ex +28 -0
- package/crawler/test/test_helper.exs +7 -0
- package/grell/.rspec +2 -0
- package/grell/.travis.yml +28 -0
- package/grell/CHANGELOG.md +111 -0
- package/grell/Gemfile +7 -0
- package/grell/LICENSE.txt +22 -0
- package/grell/README.md +213 -0
- package/grell/Rakefile +2 -0
- package/grell/grell.gemspec +36 -0
- package/grell/lib/grell/capybara_driver.rb +44 -0
- package/grell/lib/grell/crawler.rb +83 -0
- package/grell/lib/grell/crawler_manager.rb +84 -0
- package/grell/lib/grell/grell_logger.rb +10 -0
- package/grell/lib/grell/page.rb +275 -0
- package/grell/lib/grell/page_collection.rb +62 -0
- package/grell/lib/grell/rawpage.rb +62 -0
- package/grell/lib/grell/reader.rb +18 -0
- package/grell/lib/grell/version.rb +3 -0
- package/grell/lib/grell.rb +11 -0
- package/grell/spec/lib/capybara_driver_spec.rb +38 -0
- package/grell/spec/lib/crawler_manager_spec.rb +174 -0
- package/grell/spec/lib/crawler_spec.rb +361 -0
- package/grell/spec/lib/page_collection_spec.rb +159 -0
- package/grell/spec/lib/page_spec.rb +418 -0
- package/grell/spec/lib/reader_spec.rb +43 -0
- package/grell/spec/spec_helper.rb +66 -0
- package/heartmagic/config.py +1 -0
- package/heartmagic/heart.py +3 -0
- package/heartmagic/pytransform/__init__.py +483 -0
- package/heartmagic/pytransform/_pytransform.dll +0 -0
- package/heartmagic/pytransform/_pytransform.so +0 -0
- package/httpStatusCode/README.md +2 -0
- package/httpStatusCode/httpStatusCode.js +4 -0
- package/httpStatusCode/reasonPhrases.js +344 -0
- package/httpStatusCode/statusCodes.js +344 -0
- package/package.json +1 -1
- package/rubyretriever/.rspec +2 -0
- package/rubyretriever/.travis.yml +7 -0
- package/rubyretriever/Gemfile +3 -0
- package/rubyretriever/Gemfile.lock +64 -0
- package/rubyretriever/LICENSE +20 -0
- package/rubyretriever/Rakefile +7 -0
- package/rubyretriever/bin/rr +79 -0
- package/rubyretriever/lib/retriever/cli.rb +25 -0
- package/rubyretriever/lib/retriever/core_ext.rb +13 -0
- package/rubyretriever/lib/retriever/fetch.rb +268 -0
- package/rubyretriever/lib/retriever/fetchfiles.rb +71 -0
- package/rubyretriever/lib/retriever/fetchseo.rb +18 -0
- package/rubyretriever/lib/retriever/fetchsitemap.rb +43 -0
- package/rubyretriever/lib/retriever/link.rb +47 -0
- package/rubyretriever/lib/retriever/openuri_redirect_patch.rb +8 -0
- package/rubyretriever/lib/retriever/page.rb +104 -0
- package/rubyretriever/lib/retriever/page_iterator.rb +21 -0
- package/rubyretriever/lib/retriever/target.rb +47 -0
- package/rubyretriever/lib/retriever/version.rb +4 -0
- package/rubyretriever/lib/retriever.rb +15 -0
- package/rubyretriever/readme.md +166 -0
- package/rubyretriever/rubyretriever.gemspec +41 -0
- package/rubyretriever/spec/link_spec.rb +77 -0
- package/rubyretriever/spec/page_spec.rb +94 -0
- package/rubyretriever/spec/retriever_spec.rb +84 -0
- package/rubyretriever/spec/spec_helper.rb +17 -0
- package/rubyretriever/spec/target_spec.rb +55 -0
- package/snapcrawl/.changelog.old.md +157 -0
- package/snapcrawl/.gitattributes +1 -0
- package/snapcrawl/.github/workflows/test.yml +41 -0
- package/snapcrawl/.rspec +3 -0
- package/snapcrawl/.rubocop.yml +23 -0
- package/snapcrawl/CHANGELOG.md +182 -0
- package/snapcrawl/Gemfile +15 -0
- package/snapcrawl/LICENSE +21 -0
- package/snapcrawl/README.md +135 -0
- package/snapcrawl/Runfile +35 -0
- package/snapcrawl/bin/snapcrawl +25 -0
- package/snapcrawl/lib/snapcrawl/cli.rb +52 -0
- package/snapcrawl/lib/snapcrawl/config.rb +60 -0
- package/snapcrawl/lib/snapcrawl/crawler.rb +98 -0
- package/snapcrawl/lib/snapcrawl/dependencies.rb +21 -0
- package/snapcrawl/lib/snapcrawl/exceptions.rb +5 -0
- package/snapcrawl/lib/snapcrawl/log_helpers.rb +36 -0
- package/snapcrawl/lib/snapcrawl/page.rb +118 -0
- package/snapcrawl/lib/snapcrawl/pretty_logger.rb +11 -0
- package/snapcrawl/lib/snapcrawl/refinements/pair_split.rb +26 -0
- package/snapcrawl/lib/snapcrawl/refinements/string_refinements.rb +13 -0
- package/snapcrawl/lib/snapcrawl/screenshot.rb +73 -0
- package/snapcrawl/lib/snapcrawl/templates/config.yml +49 -0
- package/snapcrawl/lib/snapcrawl/templates/docopt.txt +26 -0
- package/snapcrawl/lib/snapcrawl/version.rb +3 -0
- package/snapcrawl/lib/snapcrawl.rb +20 -0
- package/snapcrawl/snapcrawl.gemspec +27 -0
- package/snapcrawl/snapcrawl.yml +41 -0
- package/snapcrawl/spec/README.md +16 -0
- package/snapcrawl/spec/approvals/bin/help +26 -0
- package/snapcrawl/spec/approvals/bin/usage +4 -0
- package/snapcrawl/spec/approvals/cli/usage +4 -0
- package/snapcrawl/spec/approvals/config/defaults +15 -0
- package/snapcrawl/spec/approvals/config/minimal +15 -0
- package/snapcrawl/spec/approvals/integration/blacklist +14 -0
- package/snapcrawl/spec/approvals/integration/default-config +14 -0
- package/snapcrawl/spec/approvals/integration/depth-0 +6 -0
- package/snapcrawl/spec/approvals/integration/depth-3 +6 -0
- package/snapcrawl/spec/approvals/integration/log-color-no +6 -0
- package/snapcrawl/spec/approvals/integration/screenshot-error +3 -0
- package/snapcrawl/spec/approvals/integration/whitelist +14 -0
- package/snapcrawl/spec/approvals/models/pretty_logger/colors +1 -0
- package/snapcrawl/spec/fixtures/config/minimal.yml +4 -0
- package/snapcrawl/spec/server/config.ru +97 -0
- package/snapcrawl/spec/snapcrawl/bin_spec.rb +15 -0
- package/snapcrawl/spec/snapcrawl/cli_spec.rb +9 -0
- package/snapcrawl/spec/snapcrawl/config_spec.rb +26 -0
- package/snapcrawl/spec/snapcrawl/integration_spec.rb +65 -0
- package/snapcrawl/spec/snapcrawl/page_spec.rb +89 -0
- package/snapcrawl/spec/snapcrawl/pretty_logger_spec.rb +19 -0
- package/snapcrawl/spec/snapcrawl/refinements/pair_split_spec.rb +27 -0
- package/snapcrawl/spec/snapcrawl/refinements/string_refinements_spec.rb +29 -0
- package/snapcrawl/spec/snapcrawl/screenshot_spec.rb +62 -0
- package/snapcrawl/spec/spec_helper.rb +22 -0
- package/snapcrawl/spec/spec_mixin.rb +10 -0
package/crawler/lib/crawler/options.ex
@@ -0,0 +1,127 @@
+defmodule Crawler.Options do
+  @moduledoc """
+  Options for the crawler.
+  """
+
+  alias Crawler.Mixfile
+  alias Crawler.Store
+
+  @assets []
+  @save_to nil
+  @workers 10
+  @interval 0
+  @max_depths 3
+  @max_pages :infinity
+  @timeout 5_000
+  @retries 2
+  @store nil
+  @force false
+  @scope nil
+  @user_agent "Crawler/#{Mixfile.project()[:version]} (https://github.com/fredwu/crawler)"
+  @url_filter Crawler.Fetcher.UrlFilter
+  @retrier Crawler.Fetcher.Retrier
+  @modifier Crawler.Fetcher.Modifier
+  @scraper Crawler.Scraper
+  @parser Crawler.Parser
+  @encode_uri false
+  @queue nil
+
+  @doc """
+  Assigns default option values.
+
+  ## Examples
+
+      iex> Options.assign_defaults(%{}) |> Map.has_key?(:depth)
+      true
+
+      iex> Options.assign_defaults(%{}) |> Map.get(:max_depths)
+      3
+
+      iex> Options.assign_defaults(%{max_depths: 4}) |> Map.get(:max_depths)
+      4
+  """
+  def assign_defaults(opts) do
+    Map.merge(
+      %{
+        depth: 0,
+        html_tag: "a",
+        assets: assets(),
+        save_to: save_to(),
+        workers: workers(),
+        interval: interval(),
+        max_depths: max_depths(),
+        max_pages: max_pages(),
+        timeout: timeout(),
+        retries: retries(),
+        store: store(),
+        force: force(),
+        scope: scope(),
+        user_agent: user_agent(),
+        url_filter: url_filter(),
+        retrier: retrier(),
+        modifier: modifier(),
+        scraper: scraper(),
+        parser: parser(),
+        encode_uri: encode_uri(),
+        queue: queue()
+      },
+      opts
+    )
+  end
+
+  @doc """
+  Takes the `url` argument and puts it in the `opts`.
+
+  The `opts` map gets passed around internally and eventually gets stored in
+  the registry.
+
+  ## Examples
+
+      iex> Options.assign_url(%{}, "http://options/")
+      %{url: "http://options/"}
+
+      iex> Options.assign_url(%{url: "http://example.com/"}, "http://options/")
+      %{url: "http://options/"}
+  """
+  def assign_url(%{encode_uri: true} = opts, url) do
+    Map.merge(opts, %{url: URI.encode(url)})
+  end
+
+  def assign_url(opts, url) do
+    Map.merge(opts, %{url: url})
+  end
+
+  def assign_scope(%{force: true, scope: nil} = opts) do
+    Map.merge(opts, %{scope: System.unique_integer()})
+  end
+
+  def assign_scope(opts), do: opts
+
+  def perform_default_actions(%{depth: 0} = opts) do
+    Store.ops_reset()
+
+    opts
+  end
+
+  def perform_default_actions(opts), do: opts
+
+  defp assets, do: Application.get_env(:crawler, :assets, @assets)
+  defp save_to, do: Application.get_env(:crawler, :save_to, @save_to)
+  defp workers, do: Application.get_env(:crawler, :workers, @workers)
+  defp interval, do: Application.get_env(:crawler, :interval, @interval)
+  defp max_depths, do: Application.get_env(:crawler, :max_depths, @max_depths)
+  defp max_pages, do: Application.get_env(:crawler, :max_pages, @max_pages)
+  defp timeout, do: Application.get_env(:crawler, :timeout, @timeout)
+  defp retries, do: Application.get_env(:crawler, :retries, @retries)
+  defp store, do: Application.get_env(:crawler, :store, @store)
+  defp force, do: Application.get_env(:crawler, :force, @force)
+  defp scope, do: Application.get_env(:crawler, :scope, @scope)
+  defp user_agent, do: Application.get_env(:crawler, :user_agent, @user_agent)
+  defp url_filter, do: Application.get_env(:crawler, :url_filter, @url_filter)
+  defp retrier, do: Application.get_env(:crawler, :retrier, @retrier)
+  defp modifier, do: Application.get_env(:crawler, :modifier, @modifier)
+  defp scraper, do: Application.get_env(:crawler, :scraper, @scraper)
+  defp parser, do: Application.get_env(:crawler, :parser, @parser)
+  defp encode_uri, do: Application.get_env(:crawler, :encode_uri, @encode_uri)
+  defp queue, do: Application.get_env(:crawler, :queue, @queue)
+end
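Every default above can also be overridden through the application environment, since each private accessor falls back to `Application.get_env(:crawler, ...)`. A minimal sketch of such overrides in a consumer app's `config/config.exs`; the keys exist in the module above, but the values here are purely illustrative:

```elixir
# config/config.exs of an app depending on :crawler (illustrative values only)
import Config

config :crawler,
  # picked up by Options.assign_defaults/1 via Application.get_env(:crawler, :workers, 10)
  workers: 25,
  # overrides the module default @max_depths of 3
  max_depths: 5,
  # custom identification string instead of the default Crawler/<version> agent
  user_agent: "MyBot/1.0 (+https://example.local/bot)"
```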
package/crawler/lib/crawler/parser/css_parser.ex
@@ -0,0 +1,37 @@
+defmodule Crawler.Parser.CssParser do
+  @moduledoc """
+  Parses CSS files.
+  """
+
+  @url_unsafe_chars ")'\""
+
+  @doc """
+  Parses CSS files.
+
+  ## Examples
+
+      iex> CssParser.parse(
+      iex>   "img { url(http://hello.world) }"
+      iex> )
+      [{"link", [{"href", "http://hello.world"}], []}]
+
+      iex> CssParser.parse(
+      iex>   "@font-face { src: url('icons.ttf') format('truetype'); }"
+      iex> )
+      [{"link", [{"href", "icons.ttf"}], []}]
+
+      iex> CssParser.parse(
+      iex>   "@font-face { src: url('data:application/blah'); }"
+      iex> )
+      []
+  """
+  def parse(body) do
+    ~r{url\(['"]?((?!data:)[^#{@url_unsafe_chars}]+)['"]?\)}
+    |> Regex.scan(body, capture: :all_but_first)
+    |> Enum.map(&prep_css_element/1)
+  end
+
+  defp prep_css_element([link]) do
+    {"link", [{"href", link}], []}
+  end
+end
package/crawler/lib/crawler/parser/guarder.ex
@@ -0,0 +1,38 @@
+defmodule Crawler.Parser.Guarder do
+  @moduledoc """
+  Detects whether a page is parsable.
+  """
+
+  @doc """
+  Detects whether a page is parsable.
+
+  ## Examples
+
+      iex> Guarder.pass?(
+      iex>   %{html_tag: "link", content_type: "text/css"}
+      iex> )
+      true
+
+      iex> Guarder.pass?(
+      iex>   %{html_tag: "img", content_type: "text/css"}
+      iex> )
+      false
+
+      iex> Guarder.pass?(
+      iex>   %{html_tag: "link", content_type: "text/css"}
+      iex> )
+      true
+
+      iex> Guarder.pass?(
+      iex>   %{html_tag: "link", content_type: "image/png"}
+      iex> )
+      false
+  """
+  def pass?(opts) do
+    is_text_link?(opts[:html_tag]) && is_text_file?(opts[:content_type])
+  end
+
+  defp is_text_link?(html_tag), do: Enum.member?(["a", "link"], html_tag)
+
+  defp is_text_file?(content_type), do: String.starts_with?(content_type, "text")
+end
package/crawler/lib/crawler/parser/html_parser.ex
@@ -0,0 +1,41 @@
+defmodule Crawler.Parser.HtmlParser do
+  @moduledoc """
+  Parses HTML files.
+  """
+
+  @tag_selectors %{
+    "pages" => "a",
+    "js" => "script[type='text/javascript'][src]",
+    "css" => "link[rel='stylesheet']",
+    "images" => "img"
+  }
+
+  @doc """
+  Parses HTML files.
+
+  ## Examples
+
+      iex> HtmlParser.parse(
+      iex>   "<a href='http://hello.world'>Link</a>",
+      iex>   %{}
+      iex> )
+      [{"a", [{"href", "http://hello.world"}], ["Link"]}]
+
+      iex> HtmlParser.parse(
+      iex>   "<script type='text/javascript'>js</script>",
+      iex>   %{assets: ["js"]}
+      iex> )
+      []
+  """
+  def parse(body, opts) do
+    {:ok, document} = Floki.parse_document(body)
+    Floki.find(document, selectors(opts))
+  end
+
+  defp selectors(opts) do
+    @tag_selectors
+    |> Map.take(["pages"] ++ (opts[:assets] || []))
+    |> Map.values()
+    |> Enum.join(", ")
+  end
+end
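Which selectors end up in the `Floki.find/2` call depends on `opts[:assets]`: the `"pages"` selector (`"a"`) is always kept, and asset selectors are added only for the listed asset types. A hedged sketch of what that means for a document crawled with `assets: ["css", "images"]` (the combined selector order may differ, since it comes from map values):

```elixir
body = "<a href='/next'>Next</a><link rel='stylesheet' href='app.css'><img src='logo.png'>"

# The combined selector is roughly "a, link[rel='stylesheet'], img",
# so the <a>, <link> and <img> nodes are all returned as Floki tuples.
Crawler.Parser.HtmlParser.parse(body, %{assets: ["css", "images"]})
```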
package/crawler/lib/crawler/parser/link_parser/link_expander.ex
@@ -0,0 +1,32 @@
+defmodule Crawler.Parser.LinkParser.LinkExpander do
+  @moduledoc """
+  Expands a link into a full URL.
+  """
+
+  alias Crawler.Linker
+
+  @doc """
+  Expands a link into a full URL.
+
+  ## Examples
+
+      iex> LinkExpander.expand({"href", "http://hello.world"}, %{})
+      {"href", "http://hello.world"}
+
+      iex> LinkExpander.expand({"href", "page"}, %{referrer_url: "http://hello.world"})
+      {"link", "page", "href", "http://hello.world/page"}
+  """
+  def expand({_src, link} = element, opts) do
+    link
+    |> is_url?()
+    |> transform_link(element, opts)
+  end
+
+  defp is_url?(link), do: String.contains?(link, "://")
+
+  defp transform_link(true, element, _opts), do: element
+
+  defp transform_link(false, {src, link}, opts) do
+    {"link", link, src, Linker.url(opts[:referrer_url], link)}
+  end
+end
package/crawler/lib/crawler/parser/link_parser.ex
@@ -0,0 +1,50 @@
+defmodule Crawler.Parser.LinkParser do
+  @moduledoc """
+  Parses links and transforms them if necessary.
+  """
+
+  alias Crawler.Parser.LinkParser.LinkExpander
+
+  @tag_attr %{
+    "a" => "href",
+    "link" => "href",
+    "script" => "src",
+    "img" => "src"
+  }
+
+  @doc """
+  Parses links and transforms them if necessary.
+
+  ## Examples
+
+      iex> LinkParser.parse(
+      iex>   {"a", [{"hello", "world"}, {"href", "http://hello.world"}], []},
+      iex>   %{},
+      iex>   &Kernel.inspect(&1, Enum.into(&2, []))
+      iex> )
+      "{\\\"href\\\", \\\"http://hello.world\\\"}"
+
+      iex> LinkParser.parse(
+      iex>   {"img", [{"hello", "world"}, {"src", "http://hello.world"}], []},
+      iex>   %{},
+      iex>   &Kernel.inspect(&1, Enum.into(&2, []))
+      iex> )
+      "{\\\"src\\\", \\\"http://hello.world\\\"}"
+  """
+  def parse({tag, attrs, _}, opts, link_handler) do
+    src = @tag_attr[tag]
+
+    with {_tag, link} <- detect_link(src, attrs),
+         element <- LinkExpander.expand({src, link}, opts) do
+      opts = Map.merge(opts, %{html_tag: tag})
+
+      link_handler.(element, opts)
+    end
+  end
+
+  defp detect_link(src, attrs) do
+    Enum.find(attrs, fn attr ->
+      Kernel.match?({^src, _link}, attr)
+    end)
+  end
+end
package/crawler/lib/crawler/parser.ex
@@ -0,0 +1,122 @@
+defmodule Crawler.Parser do
+  @moduledoc """
+  Parses pages and calls a link handler to handle the detected links.
+  """
+
+  alias Crawler.Dispatcher
+  alias Crawler.Parser.CssParser
+  alias Crawler.Parser.Guarder
+  alias Crawler.Parser.HtmlParser
+  alias Crawler.Parser.LinkParser
+
+  require Logger
+
+  defmodule Spec do
+    @moduledoc """
+    Spec for defining a parser.
+    """
+
+    alias Crawler.Store.Page
+
+    @type url :: String.t()
+    @type body :: String.t()
+    @type opts :: map
+    @type page :: %Page{url: url, body: body, opts: opts}
+
+    @callback parse(page) :: {:ok, page}
+    @callback parse({:error, term}) :: :ok
+  end
+
+  @behaviour __MODULE__.Spec
+
+  @doc """
+  Parses the links and returns the page.
+
+  There are two hooks:
+
+  - `link_handler` is useful when a custom parser calls this default parser and
+    utilises a different link handler for processing links.
+  - `scraper` is useful for scraping content immediately as the parser parses
+    the page, alternatively you can simply access the crawled data
+    asynchronously, refer to the [README](https://github.com/fredwu/crawler#usage)
+
+  ## Examples
+
+      iex> {:ok, page} = Parser.parse(%Page{
+      iex>   body: "Body",
+      iex>   opts: %{scraper: Crawler.Scraper, html_tag: "a", content_type: "text/html"}
+      iex> })
+      iex> page.body
+      "Body"
+
+      iex> {:ok, page} = Parser.parse(%Page{
+      iex>   body: "<a href='http://parser/1'>Link</a>",
+      iex>   opts: %{scraper: Crawler.Scraper, html_tag: "a", content_type: "text/html"}
+      iex> })
+      iex> page.body
+      "<a href='http://parser/1'>Link</a>"
+
+      iex> {:ok, page} = Parser.parse(%Page{
+      iex>   body: "<a name='hello'>Link</a>",
+      iex>   opts: %{scraper: Crawler.Scraper, html_tag: "a", content_type: "text/html"}
+      iex> })
+      iex> page.body
+      "<a name='hello'>Link</a>"
+
+      iex> {:ok, page} = Parser.parse(%Page{
+      iex>   body: "<a href='http://parser/2' target='_blank'>Link</a>",
+      iex>   opts: %{scraper: Crawler.Scraper, html_tag: "a", content_type: "text/html"}
+      iex> })
+      iex> page.body
+      "<a href='http://parser/2' target='_blank'>Link</a>"
+
+      iex> {:ok, page} = Parser.parse(%Page{
+      iex>   body: "<a href='parser/2'>Link</a>",
+      iex>   opts: %{scraper: Crawler.Scraper, html_tag: "a", content_type: "text/html", referrer_url: "http://hello"}
+      iex> })
+      iex> page.body
+      "<a href='parser/2'>Link</a>"
+
+      iex> {:ok, page} = Parser.parse(%Page{
+      iex>   body: "<a href='../parser/2'>Link</a>",
+      iex>   opts: %{scraper: Crawler.Scraper, html_tag: "a", content_type: "text/html", referrer_url: "http://hello"}
+      iex> })
+      iex> page.body
+      "<a href='../parser/2'>Link</a>"
+
+      iex> {:ok, page} = Parser.parse(%Page{
+      iex>   body: image_file(),
+      iex>   opts: %{scraper: Crawler.Scraper, html_tag: "img", content_type: "image/png"}
+      iex> })
+      iex> page.body
+      "\#{image_file()}"
+  """
+  def parse(input)
+
+  def parse({:warn, reason}), do: Logger.debug(fn -> "#{inspect(reason)}" end)
+  def parse({:error, reason}), do: Logger.error(fn -> "#{inspect(reason)}" end)
+
+  def parse(%{body: body, opts: opts} = page) do
+    parse_links(body, opts, &Dispatcher.dispatch(&1, &2))
+
+    {:ok, _page} = opts[:scraper].scrape(page)
+  end
+
+  def parse_links(body, opts, link_handler) do
+    opts
+    |> Guarder.pass?()
+    |> do_parse_links(body, opts, link_handler)
+  end
+
+  defp do_parse_links(false, _body, _opts, _link_handler), do: []
+
+  defp do_parse_links(true, body, opts, link_handler) do
+    Enum.map(
+      parse_file(body, opts),
+      &LinkParser.parse(&1, opts, link_handler)
+    )
+  end
+
+  defp parse_file(body, %{content_type: "text/css"}), do: CssParser.parse(body)
+  defp parse_file(body, opts), do: HtmlParser.parse(body, opts)
+end
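The nested `Spec` module spells out the behaviour a custom parser has to implement, and `parse_links/3` is public precisely so such a parser can reuse the default link detection with its own link handler. A minimal sketch of that pattern; the module name and the logging side effect are illustrative, only `Crawler.Parser.Spec`, `Crawler.Parser.parse_links/3` and `Crawler.Dispatcher.dispatch/2` come from the code above:

```elixir
defmodule MyApp.LoggingParser do
  @behaviour Crawler.Parser.Spec

  require Logger

  def parse({:error, reason}), do: Logger.error(inspect(reason))

  def parse(%{body: body, opts: opts} = page) do
    # Reuse the default link detection, but log each link before dispatching it.
    Crawler.Parser.parse_links(body, opts, fn element, opts ->
      Logger.info("found link: #{inspect(element)}")
      Crawler.Dispatcher.dispatch(element, opts)
    end)

    {:ok, page}
  end
end
```

Such a module would then be supplied through the `:parser` option that `Crawler.Options` defaults to `Crawler.Parser`.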
package/crawler/lib/crawler/queue_handler.ex
@@ -0,0 +1,45 @@
+defmodule Crawler.QueueHandler do
+  @moduledoc """
+  Handles the queueing of crawl requests.
+  """
+
+  alias Crawler.Dispatcher.Worker
+
+  @doc """
+  Enqueues a crawl request.
+
+  Also initialises the queue if it's not already initialised, this is necessary
+  so that consumer apps don't have to manually handle the queue initialisation.
+  """
+  def enqueue(opts) do
+    opts = init_queue(opts[:queue], opts)
+
+    OPQ.enqueue(opts[:queue], opts)
+
+    {:ok, opts}
+  end
+
+  defp init_queue(nil, opts) do
+    {:ok, _} =
+      DynamicSupervisor.start_child(
+        Crawler.QueueSupervisor,
+        {OPQ,
+         [
+           worker: Worker,
+           workers: opts[:workers],
+           interval: opts[:interval],
+           timeout: opts[:timeout]
+         ]}
+      )
+
+    pid =
+      Crawler.QueueSupervisor
+      |> Supervisor.which_children()
+      |> List.last()
+      |> elem(1)
+
+    Map.merge(opts, %{queue: pid})
+  end
+
+  defp init_queue(_queue, opts), do: opts
+end
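In other words, the first `enqueue/1` call with `queue: nil` starts an OPQ queue under `Crawler.QueueSupervisor` and merges its pid back into the options, so later requests reuse the same queue. A rough sketch of that flow, assuming the `:crawler` application (and therefore `Crawler.QueueSupervisor`) is already running, with illustrative option values:

```elixir
opts = %{queue: nil, workers: 10, interval: 0, timeout: 5_000, url: "http://example.local/"}

# First call: a queue is started and its pid is merged into the returned opts.
{:ok, opts} = Crawler.QueueHandler.enqueue(opts)
is_pid(opts[:queue]) # => true

# Subsequent calls with the same opts skip initialisation and just enqueue.
{:ok, _opts} = Crawler.QueueHandler.enqueue(opts)
```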
package/crawler/lib/crawler/scraper.ex
@@ -0,0 +1,28 @@
+defmodule Crawler.Scraper do
+  @moduledoc """
+  A placeholder module that demonstrates the scraping interface.
+  """
+
+  defmodule Spec do
+    @moduledoc """
+    Spec for defining a scraper.
+    """
+
+    alias Crawler.Store.Page
+
+    @type url :: String.t()
+    @type body :: String.t()
+    @type opts :: map
+    @type page :: %Page{url: url, body: body, opts: opts}
+
+    @callback scrape(page) :: {:ok, page}
+  end
+
+  alias Crawler.Store.Page
+
+  @behaviour __MODULE__.Spec
+
+  @doc """
+  """
+  def scrape(%Page{url: _url, body: _body, opts: _opts} = page), do: {:ok, page}
+end
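`Crawler.Scraper` itself just returns the page untouched; real scraping is done by a custom module that implements `Crawler.Scraper.Spec` and is passed in as the `:scraper` option, which is how `opts[:scraper].scrape(page)` gets called from `Crawler.Parser`. A minimal sketch; the module name and the file path are illustrative:

```elixir
defmodule MyApp.FileScraper do
  @behaviour Crawler.Scraper.Spec

  alias Crawler.Store.Page

  # Persist each crawled body to disk, then hand the page back unchanged.
  def scrape(%Page{url: url, body: body} = page) do
    File.write("/tmp/crawl_#{:erlang.phash2(url)}.html", body)
    {:ok, page}
  end
end
```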
package/crawler/lib/crawler/snapper/dir_maker.ex
@@ -0,0 +1,45 @@
+defmodule Crawler.Snapper.DirMaker do
+  @moduledoc """
+  Makes a new (nested) folder according to the options provided.
+  """
+
+  alias Crawler.Linker
+  alias Crawler.Linker.PathFinder
+
+  @doc """
+  Makes a new (nested) folder according to the options provided.
+
+  ## Examples
+
+      iex> DirMaker.make_dir(
+      iex>   save_to: tmp("snapper/dir_creator"),
+      iex>   url: "http://hello-world.local"
+      iex> )
+      iex> |> Path.relative_to_cwd
+      "test/tmp/snapper/dir_creator/hello-world.local/index.html"
+  """
+  def make_dir(opts) do
+    opts[:url]
+    |> prep_filepath()
+    |> build_save_path(opts[:save_to])
+    |> make_save_path(opts[:save_to])
+  end
+
+  defp prep_filepath(url) do
+    url
+    |> Linker.offline_url(url)
+    |> PathFinder.find_path()
+  end
+
+  defp build_save_path(path, save_to) do
+    Path.join(save_to, path)
+  end
+
+  defp make_save_path(path, save_to) do
+    if File.exists?(save_to) do
+      File.mkdir_p(Path.dirname(path))
+    end
+
+    path
+  end
+end
package/crawler/lib/crawler/snapper/link_replacer.ex
@@ -0,0 +1,95 @@
+defmodule Crawler.Snapper.LinkReplacer do
+  @moduledoc """
+  Replaces links found in a page so they work offline.
+  """
+
+  alias Crawler.Linker
+  alias Crawler.Parser
+
+  @doc """
+  Replaces links found in a page so they work offline.
+
+  ## Examples
+
+      iex> LinkReplacer.replace_links(
+      iex>   "<a href='http://another.domain/page.html'></a>",
+      iex>   %{
+      iex>     url: "http://main.domain/dir/page",
+      iex>     depth: 1,
+      iex>     max_depths: 2,
+      iex>     html_tag: "a",
+      iex>     content_type: "text/html",
+      iex>   }
+      iex> )
+      {:ok, "<a href='../../../another.domain/page.html'></a>"}
+
+      iex> LinkReplacer.replace_links(
+      iex>   "<a href='http://another.domain/dir/page.html'></a>",
+      iex>   %{
+      iex>     url: "http://main.domain/page",
+      iex>     depth: 1,
+      iex>     max_depths: 2,
+      iex>     html_tag: "a",
+      iex>     content_type: "text/html",
+      iex>   }
+      iex> )
+      {:ok, "<a href='../../another.domain/dir/page.html'></a>"}
+
+      iex> LinkReplacer.replace_links(
+      iex>   "<a href='http://another.domain/dir/page'></a>",
+      iex>   %{
+      iex>     url: "http://main.domain/dir/page",
+      iex>     depth: 1,
+      iex>     max_depths: 2,
+      iex>     html_tag: "a",
+      iex>     content_type: "text/html",
+      iex>   }
+      iex> )
+      {:ok, "<a href='../../../another.domain/dir/page/index.html'></a>"}
+
+      iex> LinkReplacer.replace_links(
+      iex>   "<a href='/dir/page2.html'></a>",
+      iex>   %{
+      iex>     url: "http://main.domain/dir/page",
+      iex>     referrer_url: "http://main.domain/dir/page",
+      iex>     depth: 1,
+      iex>     max_depths: 2,
+      iex>     html_tag: "a",
+      iex>     content_type: "text/html",
+      iex>   }
+      iex> )
+      {:ok, "<a href='../../../main.domain/dir/page2.html'></a>"}
+  """
+  def replace_links(body, opts) do
+    new_body =
+      body
+      |> Parser.parse_links(opts, &get_link/2)
+      |> List.flatten()
+      |> Enum.reject(&(&1 == nil))
+      |> Enum.reduce(body, &modify_body(opts[:content_type], &2, opts[:url], &1))
+
+    {:ok, new_body}
+  end
+
+  defp get_link({_, url}, _opts), do: url
+  defp get_link({_, link, _, url}, _opts), do: [link, url]
+
+  defp modify_body(content_type, body, current_url, link) do
+    String.replace(
+      body,
+      regexes(content_type, link),
+      modify_link(current_url, link)
+    )
+  end
+
+  defp regexes(content_type, link) do
+    case content_type do
+      "text/css" -> ~r{((?!url)\(['"]?)#{link}(['"]?\))}
+      _ -> ~r{((?!src|href)=['"])#{link}(['"])}
+    end
+  end
+
+  defp modify_link(current_url, link) do
+    "\\1" <> Linker.offline_link(current_url, link) <> "\\2"
+  end
+end