powerdlz23 1.2.3 → 1.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Spider/README.md +19 -0
- package/Spider/domain.py +18 -0
- package/Spider/general.py +51 -0
- package/Spider/link_finder.py +25 -0
- package/Spider/main.py +50 -0
- package/Spider/spider.py +74 -0
- package/crawler/.formatter.exs +5 -0
- package/crawler/.github/workflows/ci.yml +29 -0
- package/crawler/.recode.exs +33 -0
- package/crawler/.tool-versions +2 -0
- package/crawler/CHANGELOG.md +82 -0
- package/crawler/README.md +198 -0
- package/crawler/architecture.svg +4 -0
- package/crawler/config/config.exs +9 -0
- package/crawler/config/dev.exs +5 -0
- package/crawler/config/test.exs +5 -0
- package/crawler/examples/google_search/scraper.ex +37 -0
- package/crawler/examples/google_search/url_filter.ex +11 -0
- package/crawler/examples/google_search.ex +77 -0
- package/crawler/lib/crawler/dispatcher/worker.ex +14 -0
- package/crawler/lib/crawler/dispatcher.ex +20 -0
- package/crawler/lib/crawler/fetcher/header_preparer.ex +60 -0
- package/crawler/lib/crawler/fetcher/modifier.ex +45 -0
- package/crawler/lib/crawler/fetcher/policer.ex +77 -0
- package/crawler/lib/crawler/fetcher/recorder.ex +55 -0
- package/crawler/lib/crawler/fetcher/requester.ex +32 -0
- package/crawler/lib/crawler/fetcher/retrier.ex +43 -0
- package/crawler/lib/crawler/fetcher/url_filter.ex +26 -0
- package/crawler/lib/crawler/fetcher.ex +81 -0
- package/crawler/lib/crawler/http.ex +7 -0
- package/crawler/lib/crawler/linker/path_builder.ex +71 -0
- package/crawler/lib/crawler/linker/path_expander.ex +59 -0
- package/crawler/lib/crawler/linker/path_finder.ex +106 -0
- package/crawler/lib/crawler/linker/path_offliner.ex +59 -0
- package/crawler/lib/crawler/linker/path_prefixer.ex +46 -0
- package/crawler/lib/crawler/linker.ex +173 -0
- package/crawler/lib/crawler/options.ex +127 -0
- package/crawler/lib/crawler/parser/css_parser.ex +37 -0
- package/crawler/lib/crawler/parser/guarder.ex +38 -0
- package/crawler/lib/crawler/parser/html_parser.ex +41 -0
- package/crawler/lib/crawler/parser/link_parser/link_expander.ex +32 -0
- package/crawler/lib/crawler/parser/link_parser.ex +50 -0
- package/crawler/lib/crawler/parser.ex +122 -0
- package/crawler/lib/crawler/queue_handler.ex +45 -0
- package/crawler/lib/crawler/scraper.ex +28 -0
- package/crawler/lib/crawler/snapper/dir_maker.ex +45 -0
- package/crawler/lib/crawler/snapper/link_replacer.ex +95 -0
- package/crawler/lib/crawler/snapper.ex +82 -0
- package/crawler/lib/crawler/store/counter.ex +19 -0
- package/crawler/lib/crawler/store/page.ex +7 -0
- package/crawler/lib/crawler/store.ex +87 -0
- package/crawler/lib/crawler/worker.ex +62 -0
- package/crawler/lib/crawler.ex +91 -0
- package/crawler/mix.exs +78 -0
- package/crawler/mix.lock +40 -0
- package/crawler/test/fixtures/introducing-elixir.jpg +0 -0
- package/crawler/test/integration_test.exs +135 -0
- package/crawler/test/lib/crawler/dispatcher/worker_test.exs +7 -0
- package/crawler/test/lib/crawler/dispatcher_test.exs +5 -0
- package/crawler/test/lib/crawler/fetcher/header_preparer_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher/policer_test.exs +71 -0
- package/crawler/test/lib/crawler/fetcher/recorder_test.exs +9 -0
- package/crawler/test/lib/crawler/fetcher/requester_test.exs +9 -0
- package/crawler/test/lib/crawler/fetcher/retrier_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher/url_filter_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher_test.exs +153 -0
- package/crawler/test/lib/crawler/http_test.exs +47 -0
- package/crawler/test/lib/crawler/linker/path_builder_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_expander_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_finder_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_offliner_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_prefixer_test.exs +7 -0
- package/crawler/test/lib/crawler/linker_test.exs +7 -0
- package/crawler/test/lib/crawler/options_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/css_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/guarder_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/html_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/link_parser/link_expander_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/link_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser_test.exs +8 -0
- package/crawler/test/lib/crawler/queue_handler_test.exs +7 -0
- package/crawler/test/lib/crawler/scraper_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper/dir_maker_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper/link_replacer_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper_test.exs +9 -0
- package/crawler/test/lib/crawler/worker_test.exs +5 -0
- package/crawler/test/lib/crawler_test.exs +295 -0
- package/crawler/test/support/test_case.ex +24 -0
- package/crawler/test/support/test_helpers.ex +28 -0
- package/crawler/test/test_helper.exs +7 -0
- package/grell/.rspec +2 -0
- package/grell/.travis.yml +28 -0
- package/grell/CHANGELOG.md +111 -0
- package/grell/Gemfile +7 -0
- package/grell/LICENSE.txt +22 -0
- package/grell/README.md +213 -0
- package/grell/Rakefile +2 -0
- package/grell/grell.gemspec +36 -0
- package/grell/lib/grell/capybara_driver.rb +44 -0
- package/grell/lib/grell/crawler.rb +83 -0
- package/grell/lib/grell/crawler_manager.rb +84 -0
- package/grell/lib/grell/grell_logger.rb +10 -0
- package/grell/lib/grell/page.rb +275 -0
- package/grell/lib/grell/page_collection.rb +62 -0
- package/grell/lib/grell/rawpage.rb +62 -0
- package/grell/lib/grell/reader.rb +18 -0
- package/grell/lib/grell/version.rb +3 -0
- package/grell/lib/grell.rb +11 -0
- package/grell/spec/lib/capybara_driver_spec.rb +38 -0
- package/grell/spec/lib/crawler_manager_spec.rb +174 -0
- package/grell/spec/lib/crawler_spec.rb +361 -0
- package/grell/spec/lib/page_collection_spec.rb +159 -0
- package/grell/spec/lib/page_spec.rb +418 -0
- package/grell/spec/lib/reader_spec.rb +43 -0
- package/grell/spec/spec_helper.rb +66 -0
- package/heartmagic/config.py +1 -0
- package/heartmagic/heart.py +3 -0
- package/heartmagic/pytransform/__init__.py +483 -0
- package/heartmagic/pytransform/_pytransform.dll +0 -0
- package/heartmagic/pytransform/_pytransform.so +0 -0
- package/httpStatusCode/README.md +2 -0
- package/httpStatusCode/httpStatusCode.js +4 -0
- package/httpStatusCode/reasonPhrases.js +344 -0
- package/httpStatusCode/statusCodes.js +344 -0
- package/package.json +1 -1
- package/rubyretriever/.rspec +2 -0
- package/rubyretriever/.travis.yml +7 -0
- package/rubyretriever/Gemfile +3 -0
- package/rubyretriever/Gemfile.lock +64 -0
- package/rubyretriever/LICENSE +20 -0
- package/rubyretriever/Rakefile +7 -0
- package/rubyretriever/bin/rr +79 -0
- package/rubyretriever/lib/retriever/cli.rb +25 -0
- package/rubyretriever/lib/retriever/core_ext.rb +13 -0
- package/rubyretriever/lib/retriever/fetch.rb +268 -0
- package/rubyretriever/lib/retriever/fetchfiles.rb +71 -0
- package/rubyretriever/lib/retriever/fetchseo.rb +18 -0
- package/rubyretriever/lib/retriever/fetchsitemap.rb +43 -0
- package/rubyretriever/lib/retriever/link.rb +47 -0
- package/rubyretriever/lib/retriever/openuri_redirect_patch.rb +8 -0
- package/rubyretriever/lib/retriever/page.rb +104 -0
- package/rubyretriever/lib/retriever/page_iterator.rb +21 -0
- package/rubyretriever/lib/retriever/target.rb +47 -0
- package/rubyretriever/lib/retriever/version.rb +4 -0
- package/rubyretriever/lib/retriever.rb +15 -0
- package/rubyretriever/readme.md +166 -0
- package/rubyretriever/rubyretriever.gemspec +41 -0
- package/rubyretriever/spec/link_spec.rb +77 -0
- package/rubyretriever/spec/page_spec.rb +94 -0
- package/rubyretriever/spec/retriever_spec.rb +84 -0
- package/rubyretriever/spec/spec_helper.rb +17 -0
- package/rubyretriever/spec/target_spec.rb +55 -0
- package/snapcrawl/.changelog.old.md +157 -0
- package/snapcrawl/.gitattributes +1 -0
- package/snapcrawl/.github/workflows/test.yml +41 -0
- package/snapcrawl/.rspec +3 -0
- package/snapcrawl/.rubocop.yml +23 -0
- package/snapcrawl/CHANGELOG.md +182 -0
- package/snapcrawl/Gemfile +15 -0
- package/snapcrawl/LICENSE +21 -0
- package/snapcrawl/README.md +135 -0
- package/snapcrawl/Runfile +35 -0
- package/snapcrawl/bin/snapcrawl +25 -0
- package/snapcrawl/lib/snapcrawl/cli.rb +52 -0
- package/snapcrawl/lib/snapcrawl/config.rb +60 -0
- package/snapcrawl/lib/snapcrawl/crawler.rb +98 -0
- package/snapcrawl/lib/snapcrawl/dependencies.rb +21 -0
- package/snapcrawl/lib/snapcrawl/exceptions.rb +5 -0
- package/snapcrawl/lib/snapcrawl/log_helpers.rb +36 -0
- package/snapcrawl/lib/snapcrawl/page.rb +118 -0
- package/snapcrawl/lib/snapcrawl/pretty_logger.rb +11 -0
- package/snapcrawl/lib/snapcrawl/refinements/pair_split.rb +26 -0
- package/snapcrawl/lib/snapcrawl/refinements/string_refinements.rb +13 -0
- package/snapcrawl/lib/snapcrawl/screenshot.rb +73 -0
- package/snapcrawl/lib/snapcrawl/templates/config.yml +49 -0
- package/snapcrawl/lib/snapcrawl/templates/docopt.txt +26 -0
- package/snapcrawl/lib/snapcrawl/version.rb +3 -0
- package/snapcrawl/lib/snapcrawl.rb +20 -0
- package/snapcrawl/snapcrawl.gemspec +27 -0
- package/snapcrawl/snapcrawl.yml +41 -0
- package/snapcrawl/spec/README.md +16 -0
- package/snapcrawl/spec/approvals/bin/help +26 -0
- package/snapcrawl/spec/approvals/bin/usage +4 -0
- package/snapcrawl/spec/approvals/cli/usage +4 -0
- package/snapcrawl/spec/approvals/config/defaults +15 -0
- package/snapcrawl/spec/approvals/config/minimal +15 -0
- package/snapcrawl/spec/approvals/integration/blacklist +14 -0
- package/snapcrawl/spec/approvals/integration/default-config +14 -0
- package/snapcrawl/spec/approvals/integration/depth-0 +6 -0
- package/snapcrawl/spec/approvals/integration/depth-3 +6 -0
- package/snapcrawl/spec/approvals/integration/log-color-no +6 -0
- package/snapcrawl/spec/approvals/integration/screenshot-error +3 -0
- package/snapcrawl/spec/approvals/integration/whitelist +14 -0
- package/snapcrawl/spec/approvals/models/pretty_logger/colors +1 -0
- package/snapcrawl/spec/fixtures/config/minimal.yml +4 -0
- package/snapcrawl/spec/server/config.ru +97 -0
- package/snapcrawl/spec/snapcrawl/bin_spec.rb +15 -0
- package/snapcrawl/spec/snapcrawl/cli_spec.rb +9 -0
- package/snapcrawl/spec/snapcrawl/config_spec.rb +26 -0
- package/snapcrawl/spec/snapcrawl/integration_spec.rb +65 -0
- package/snapcrawl/spec/snapcrawl/page_spec.rb +89 -0
- package/snapcrawl/spec/snapcrawl/pretty_logger_spec.rb +19 -0
- package/snapcrawl/spec/snapcrawl/refinements/pair_split_spec.rb +27 -0
- package/snapcrawl/spec/snapcrawl/refinements/string_refinements_spec.rb +29 -0
- package/snapcrawl/spec/snapcrawl/screenshot_spec.rb +62 -0
- package/snapcrawl/spec/spec_helper.rb +22 -0
- package/snapcrawl/spec/spec_mixin.rb +10 -0
package/grell/spec/lib/crawler_spec.rb

```diff
@@ -0,0 +1,361 @@
+
+RSpec.describe Grell::Crawler do
+  let(:page_id) { rand(10).floor + 10 }
+  let(:parent_page_id) { rand(10).floor }
+  let(:page) { Grell::Page.new(url, page_id, parent_page_id) }
+  let(:host) { 'http://www.example.com' }
+  let(:url) { 'http://www.example.com/test' }
+  let(:add_match_block) { nil }
+  let(:denylist) { /a^/ }
+  let(:allowlist) { /.*/ }
+  let(:crawler) do
+    Grell::Crawler.new(
+      logger: Logger.new(nil),
+      driver: double(nil),
+      evaluate_in_each_page: script,
+      add_match_block: add_match_block,
+      denylist: denylist,
+      allowlist: allowlist)
+  end
+  let(:script) { nil }
+  let(:body) { 'body' }
+  let(:custom_add_match) do
+    Proc.new do |collection_page, page|
+      collection_page.path == page.path
+    end
+  end
+
+  before do
+    proxy.stub(url).and_return(body: body, code: 200)
+  end
+
+  describe '#crawl' do
+    before do
+      crawler.instance_variable_set('@collection', Grell::PageCollection.new(custom_add_match))
+    end
+
+    it 'yields the result if a block is given' do
+      result = []
+      block = Proc.new { |n| result.push(n) }
+      crawler.crawl(page, block)
+      expect(result.size).to eq(1)
+      expect(result.first.url).to eq(url)
+      expect(result.first.visited?).to eq(true)
+    end
+
+    it 'rescues any specified exceptions raised during the block execution' do
+      block = Proc.new { |n| raise Capybara::Poltergeist::BrowserError, 'Exception' }
+      expect { crawler.crawl(page, block) }.to_not raise_error
+      expect(page.status).to eq(404)
+    end
+
+    it 'logs interesting information' do
+      crawler
+      expect(Grell.logger).to receive(:info).with(/Visiting #{url}, visited_links: 0, discovered 0/)
+      crawler.crawl(page, nil)
+    end
+
+    it 'retries when the block returns :retry' do
+      counter = 0
+      times_retrying = 2
+      block = Proc.new do |n|
+        if counter < times_retrying
+          counter += 1
+          :retry
+        end
+      end
+      crawler.crawl(page, block)
+      expect(counter).to eq(times_retrying)
+    end
+
+    it 'handles redirects by adding the current_url to the page collection' do
+      redirect_url = 'http://www.example.com/test/landing_page'
+      allow(page).to receive(:current_url).and_return(redirect_url)
+      expect_any_instance_of(Grell::PageCollection).to receive(:create_page).with(redirect_url, page_id)
+      crawler.crawl(page, nil)
+    end
+
+    context 'without script' do
+      it 'does not evaluate a script' do
+        expect_any_instance_of(Capybara::Session).not_to receive(:evaluate_script)
+        crawler.crawl(page, nil)
+      end
+    end
+
+    context 'with script' do
+      let(:script) { "(typeof(jQuery)!='undefined') && $('.dropdown').addClass('open');" }
+      it 'evaluates a script' do
+        expect_any_instance_of(Capybara::Session).to receive(:evaluate_script).with(script)
+        crawler.crawl(page, nil)
+      end
+    end
+  end
+
+  context '#start_crawling' do
+    let(:body) do
+      <<-EOS
+      <html><head></head><body>
+      <a href="/musmis.html">trusmis</a>
+      Hello world!
+      </body></html>
+      EOS
+    end
+    let(:url_visited) { "http://www.example.com/musmis.html" }
+
+    before do
+      proxy.stub(url_visited).and_return(body: 'body', code: 200)
+    end
+
+    it 'calls the block we used to start_crawling' do
+      result = []
+      block = Proc.new { |n| result.push(n) }
+      crawler.start_crawling(url, &block)
+      expect(result.size).to eq(2)
+      expect(result[0].url).to eq(url)
+      expect(result[1].url).to eq(url_visited)
+    end
+
+  end
+
+  shared_examples_for 'visits all available pages' do
+    it 'visits all the pages' do
+      crawler.start_crawling(url)
+      expect(crawler.collection.visited_pages.size).to eq(visited_pages_count)
+    end
+
+    it 'has no more pages to discover' do
+      crawler.start_crawling(url)
+      expect(crawler.collection.discovered_pages.size).to eq(0)
+    end
+
+    it 'contains the allowlisted page and the base page only' do
+      crawler.start_crawling(url)
+      expect(crawler.collection.visited_pages.map(&:url)).
+        to eq(visited_pages)
+    end
+  end
+
+  context 'the url has no links' do
+    let(:body) do
+      "<html><head></head><body>
+      Hello world!
+      </body></html>"
+    end
+    let(:visited_pages_count) { 1 }
+    let(:visited_pages) { ['http://www.example.com/test'] }
+
+    it_behaves_like 'visits all available pages'
+  end
+
+  context 'the url has several links' do
+    let(:visited_pages_count) { 3 }
+    let(:visited_pages) do
+      ['http://www.example.com/test', 'http://www.example.com/trusmis.html', 'http://www.example.com/help.html']
+    end
+    let(:body) do
+      "<html><head></head><body>
+      <a href=\"/trusmis.html\">trusmis</a>
+      <a href=\"/help.html\">help</a>
+      Hello world!
+      </body></html>"
+    end
+
+    before do
+      proxy.stub('http://www.example.com/trusmis.html').and_return(body: 'body', code: 200)
+      proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200)
+    end
+
+    it_behaves_like 'visits all available pages'
+  end
+
+  describe '#allowlist' do
+    let(:body) do
+      "<html><head></head><body>
+      <a href=\"/trusmis.html\">trusmis</a>
+      <a href=\"/help.html\">help</a>
+      Hello world!
+      </body></html>"
+    end
+
+    before do
+      proxy.stub('http://www.example.com/trusmis.html').and_return(body: 'body', code: 200)
+      proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200)
+    end
+
+    context 'using a single string' do
+      let(:allowlist) { '/trusmis.html' }
+      let(:visited_pages_count) { 2 } # my own page + trusmis
+      let(:visited_pages) do
+        ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
+      end
+
+      it_behaves_like 'visits all available pages'
+    end
+
+    context 'using an array of strings' do
+      let(:allowlist) { ['/trusmis.html', '/nothere', 'another.html'] }
+      let(:visited_pages_count) { 2 }
+      let(:visited_pages) do
+        ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
+      end
+
+      it_behaves_like 'visits all available pages'
+    end
+
+    context 'using a regexp' do
+      let(:allowlist) { /\/trusmis\.html/ }
+      let(:visited_pages_count) { 2 }
+      let(:visited_pages) do
+        ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
+      end
+
+      it_behaves_like 'visits all available pages'
+    end
+
+    context 'using an array of regexps' do
+      let(:allowlist) { [/\/trusmis\.html/] }
+      let(:visited_pages_count) { 2 }
+      let(:visited_pages) do
+        ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
+      end
+
+      it_behaves_like 'visits all available pages'
+    end
+
+    context 'using an empty array' do
+      let(:allowlist) { [] }
+      let(:visited_pages_count) { 1 } # my own page only
+      let(:visited_pages) do
+        ['http://www.example.com/test']
+      end
+
+      it_behaves_like 'visits all available pages'
+    end
+
+    context 'adding all links to the allowlist' do
+      let(:allowlist) { ['/trusmis', '/help'] }
+      let(:visited_pages_count) { 3 } # all links
+      let(:visited_pages) do
+        ['http://www.example.com/test', 'http://www.example.com/trusmis.html', 'http://www.example.com/help.html']
+      end
+
+      it_behaves_like 'visits all available pages'
+    end
+  end
+
+
+  describe '#denylist' do
+    let(:body) do
+      "<html><head></head><body>
+      <a href=\"/trusmis.html\">trusmis</a>
+      <a href=\"/help.html\">help</a>
+      Hello world!
+      </body></html>"
+    end
+
+    before do
+      proxy.stub('http://www.example.com/trusmis.html').and_return(body: 'body', code: 200)
+      proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200)
+    end
+
+    context 'using a single string' do
+      let(:denylist) { '/trusmis.html' }
+      let(:visited_pages_count) { 2 }
+      let(:visited_pages) do
+        ['http://www.example.com/test', 'http://www.example.com/help.html']
+      end
+
+      it_behaves_like 'visits all available pages'
+    end
+
+    context 'using an array of strings' do
+      let(:denylist) { ['/trusmis.html', '/nothere', 'another.html'] }
+      let(:visited_pages_count) { 2 }
+      let(:visited_pages) do
+        ['http://www.example.com/test', 'http://www.example.com/help.html']
+      end
+
+      it_behaves_like 'visits all available pages'
+    end
+
+    context 'using a regexp' do
+      let(:denylist) { /\/trusmis\.html/ }
+      let(:visited_pages_count) { 2 }
+      let(:visited_pages) do
+        ['http://www.example.com/test', 'http://www.example.com/help.html']
+      end
+
+      it_behaves_like 'visits all available pages'
+    end
+
+    context 'using an array of regexps' do
+      let(:denylist) { [/\/trusmis\.html/] }
+      let(:visited_pages_count) { 2 }
+      let(:visited_pages) do
+        ['http://www.example.com/test', 'http://www.example.com/help.html']
+      end
+
+      it_behaves_like 'visits all available pages'
+    end
+
+    context 'using an empty array' do
+      let(:denylist) { [] }
+      let(:visited_pages_count) { 3 } # all links
+      let(:visited_pages) do
+        ['http://www.example.com/test', 'http://www.example.com/trusmis.html', 'http://www.example.com/help.html']
+      end
+
+      it_behaves_like 'visits all available pages'
+    end
+
+    context 'adding all links to the denylist' do
+      let(:denylist) { ['/trusmis', '/help'] }
+      let(:visited_pages_count) { 1 }
+      let(:visited_pages) do
+        ['http://www.example.com/test']
+      end
+
+      it_behaves_like 'visits all available pages'
+    end
+  end
+
+
+  describe 'allowlisting and denylisting' do
+    let(:body) do
+      "<html><head></head><body>
+      <a href=\"/trusmis.html\">trusmis</a>
+      <a href=\"/help.html\">help</a>
+      Hello world!
+      </body></html>"
+    end
+
+    before do
+      proxy.stub('http://www.example.com/trusmis.html').and_return(body: 'body', code: 200)
+      proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200)
+    end
+
+    context 'we denylist the only allowlisted page' do
+      let(:allowlist) { '/trusmis.html' }
+      let(:denylist) { '/trusmis.html' }
+      let(:visited_pages_count) { 1 }
+      let(:visited_pages) do
+        ['http://www.example.com/test']
+      end
+
+      it_behaves_like 'visits all available pages'
+    end
+
+    context 'we denylist none of the allowlisted pages' do
+      let(:allowlist) { '/trusmis.html' }
+      let(:denylist) { '/raistlin.html' }
+      let(:visited_pages_count) { 2 }
+      let(:visited_pages) do
+        ['http://www.example.com/test', 'http://www.example.com/trusmis.html']
+      end
+
+      it_behaves_like 'visits all available pages'
+    end
+  end
+
+
+end
```
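The crawler spec above doubles as a compact reference for grell's public crawling API. Below is a minimal usage sketch assembled only from the calls the spec exercises (the `Grell::Crawler.new` options, `start_crawling` with a block, the `:retry` return value, and the `Page` accessors `url`, `status`, and `visited?`). The target URL, the `/logout` path, and the retry condition are illustrative assumptions, and constructor options not shown are assumed to have workable defaults:

```ruby
require 'logger'
require 'grell'

# Options mirror those exercised in crawler_spec.rb: allowlist/denylist
# accept a String, a Regexp, or an Array of either.
crawler = Grell::Crawler.new(
  logger: Logger.new($stdout),
  allowlist: ['/trusmis', '/help'],
  denylist: '/logout'              # hypothetical path, for illustration
)

# The block runs once per page; per the spec, returning :retry makes the
# crawler visit the same page again.
crawler.start_crawling('http://www.example.com') do |page|
  next :retry unless page.visited? # hypothetical retry condition
  puts "#{page.status} #{page.url}"
end
```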
package/grell/spec/lib/page_collection_spec.rb

```diff
@@ -0,0 +1,159 @@
+
+RSpec.describe Grell::PageCollection do
+  let(:add_match_block) do
+    Proc.new do |collection_page, page|
+      collection_page.url.downcase == page.url.downcase
+    end
+  end
+
+  let(:collection) { Grell::PageCollection.new(add_match_block) }
+  let(:url) { 'http://www.github.com/SomeUser/dragonlance?search=false' }
+  let(:url2) { 'http://www.github.com/OtherUser/forgotten?search=false' }
+
+  context 'empty collection' do
+
+    it 'has no visited pages' do
+      expect(collection.visited_pages).to be_empty
+    end
+
+    it 'has no discovered pages' do
+      expect(collection.discovered_pages).to be_empty
+    end
+
+    it 'next page is nil' do
+      expect(collection.next_page).to be_nil
+    end
+  end
+
+  context 'one unvisited page' do
+    let(:page) { collection.create_page(url, 0) }
+
+    before do
+      allow(page).to receive(:visited?).and_return(false)
+    end
+
+    it 'has no visited pages' do
+      expect(collection.visited_pages).to be_empty
+    end
+
+    it 'has one discovered page' do
+      expect(collection.discovered_pages).to eq([page])
+
+    end
+
+    it 'next page is the unvisited page' do
+      expect(collection.next_page).to eq(page)
+    end
+  end
+
+  context 'one visited page' do
+    let(:page) { collection.create_page(url, 0) }
+
+    before do
+      allow(page).to receive(:visited?).and_return(true)
+    end
+
+    it 'has one visited page' do
+      expect(collection.visited_pages).to eq([page])
+    end
+
+    it 'has no discovered pages' do
+      expect(collection.discovered_pages).to be_empty
+    end
+
+    it 'next page is nil' do
+      expect(collection.next_page).to be_nil
+    end
+  end
+
+  context 'one visited and one unvisited page with the same url' do
+    let(:page) { collection.create_page(url, 0) }
+    let(:unvisited) { collection.create_page(url.upcase, 0) }
+
+    before do
+      allow(page).to receive(:visited?).and_return(true)
+      allow(unvisited).to receive(:visited?).and_return(false)
+    end
+
+    it 'first page has id 0' do
+      expect(page.id).to eq(0)
+    end
+
+    it 'second page has id 1' do
+      expect(unvisited.id).to eq(1)
+    end
+
+    it 'has one visited page' do
+      expect(collection.visited_pages).to eq([page])
+    end
+
+    it 'has no discovered pages' do
+      expect(collection.discovered_pages).to be_empty
+    end
+
+    it 'next page is nil' do
+      expect(collection.next_page).to be_nil
+    end
+  end
+
+  context 'one visited and one unvisited page with different URLs' do
+    let(:page) { collection.create_page(url, 0) }
+    let(:unvisited) { collection.create_page(url2, 0) }
+
+    before do
+      allow(page).to receive(:visited?).and_return(true)
+      allow(unvisited).to receive(:visited?).and_return(false)
+    end
+
+    it 'has one visited page' do
+      expect(collection.visited_pages).to eq([page])
+    end
+
+    it 'has one discovered page' do
+      expect(collection.discovered_pages).to eq([unvisited])
+    end
+
+    it 'next page is the unvisited page' do
+      expect(collection.next_page).to eq(unvisited)
+    end
+  end
+
+  context 'one visited and one unvisited page with different URLs only different by the query' do
+    let(:page) { collection.create_page(url, 0) }
+    let(:url3) { 'http://www.github.com/SomeUser/dragonlance?search=true' }
+    let(:unvisited) { collection.create_page(url3, 0) }
+
+    before do
+      allow(page).to receive(:visited?).and_return(true)
+      allow(unvisited).to receive(:visited?).and_return(false)
+    end
+
+    it 'has one visited page' do
+      expect(collection.visited_pages).to eq([page])
+    end
+
+    it 'has one discovered page' do
+      expect(collection.discovered_pages).to eq([unvisited])
+    end
+
+    it 'next page is the unvisited page' do
+      expect(collection.next_page).to eq(unvisited)
+    end
+  end
+
+  context 'several unvisited pages' do
+    let(:page) { collection.create_page(url, 2) }
+    let(:page2) { collection.create_page(url2, 0) }
+
+    before do
+      allow(page).to receive(:visited?).and_return(true)
+      allow(page2).to receive(:visited?).and_return(false)
+    end
+
+    it 'returns the page which has an earlier parent' do
+      expect(collection.next_page).to eq(page2)
+    end
+
+  end
+
+end
```
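The page-collection spec above pins down two behaviors worth restating: the caller-supplied match block decides when two discovered URLs count as the same page, and `next_page` prefers unvisited pages whose parent page has the lower id. A small sketch under those assumptions; the URLs are placeholders, and the `# =>` result follows the 'several unvisited pages' example rather than any documented guarantee:

```ruby
require 'grell'

# Same-page test from the spec: URLs compared case-insensitively.
same_url = Proc.new do |collection_page, page|
  collection_page.url.downcase == page.url.downcase
end

collection = Grell::PageCollection.new(same_url)

late  = collection.create_page('http://www.example.com/one', 2) # discovered on parent page 2
early = collection.create_page('http://www.example.com/two', 0) # discovered on parent page 0

# Both pages are unvisited; the one discovered from the earlier parent
# should be crawled first.
collection.next_page # => early
```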