powerdlz23 1.2.3 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Spider/README.md +19 -0
- package/Spider/domain.py +18 -0
- package/Spider/general.py +51 -0
- package/Spider/link_finder.py +25 -0
- package/Spider/main.py +50 -0
- package/Spider/spider.py +74 -0
- package/crawler/.formatter.exs +5 -0
- package/crawler/.github/workflows/ci.yml +29 -0
- package/crawler/.recode.exs +33 -0
- package/crawler/.tool-versions +2 -0
- package/crawler/CHANGELOG.md +82 -0
- package/crawler/README.md +198 -0
- package/crawler/architecture.svg +4 -0
- package/crawler/config/config.exs +9 -0
- package/crawler/config/dev.exs +5 -0
- package/crawler/config/test.exs +5 -0
- package/crawler/examples/google_search/scraper.ex +37 -0
- package/crawler/examples/google_search/url_filter.ex +11 -0
- package/crawler/examples/google_search.ex +77 -0
- package/crawler/lib/crawler/dispatcher/worker.ex +14 -0
- package/crawler/lib/crawler/dispatcher.ex +20 -0
- package/crawler/lib/crawler/fetcher/header_preparer.ex +60 -0
- package/crawler/lib/crawler/fetcher/modifier.ex +45 -0
- package/crawler/lib/crawler/fetcher/policer.ex +77 -0
- package/crawler/lib/crawler/fetcher/recorder.ex +55 -0
- package/crawler/lib/crawler/fetcher/requester.ex +32 -0
- package/crawler/lib/crawler/fetcher/retrier.ex +43 -0
- package/crawler/lib/crawler/fetcher/url_filter.ex +26 -0
- package/crawler/lib/crawler/fetcher.ex +81 -0
- package/crawler/lib/crawler/http.ex +7 -0
- package/crawler/lib/crawler/linker/path_builder.ex +71 -0
- package/crawler/lib/crawler/linker/path_expander.ex +59 -0
- package/crawler/lib/crawler/linker/path_finder.ex +106 -0
- package/crawler/lib/crawler/linker/path_offliner.ex +59 -0
- package/crawler/lib/crawler/linker/path_prefixer.ex +46 -0
- package/crawler/lib/crawler/linker.ex +173 -0
- package/crawler/lib/crawler/options.ex +127 -0
- package/crawler/lib/crawler/parser/css_parser.ex +37 -0
- package/crawler/lib/crawler/parser/guarder.ex +38 -0
- package/crawler/lib/crawler/parser/html_parser.ex +41 -0
- package/crawler/lib/crawler/parser/link_parser/link_expander.ex +32 -0
- package/crawler/lib/crawler/parser/link_parser.ex +50 -0
- package/crawler/lib/crawler/parser.ex +122 -0
- package/crawler/lib/crawler/queue_handler.ex +45 -0
- package/crawler/lib/crawler/scraper.ex +28 -0
- package/crawler/lib/crawler/snapper/dir_maker.ex +45 -0
- package/crawler/lib/crawler/snapper/link_replacer.ex +95 -0
- package/crawler/lib/crawler/snapper.ex +82 -0
- package/crawler/lib/crawler/store/counter.ex +19 -0
- package/crawler/lib/crawler/store/page.ex +7 -0
- package/crawler/lib/crawler/store.ex +87 -0
- package/crawler/lib/crawler/worker.ex +62 -0
- package/crawler/lib/crawler.ex +91 -0
- package/crawler/mix.exs +78 -0
- package/crawler/mix.lock +40 -0
- package/crawler/test/fixtures/introducing-elixir.jpg +0 -0
- package/crawler/test/integration_test.exs +135 -0
- package/crawler/test/lib/crawler/dispatcher/worker_test.exs +7 -0
- package/crawler/test/lib/crawler/dispatcher_test.exs +5 -0
- package/crawler/test/lib/crawler/fetcher/header_preparer_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher/policer_test.exs +71 -0
- package/crawler/test/lib/crawler/fetcher/recorder_test.exs +9 -0
- package/crawler/test/lib/crawler/fetcher/requester_test.exs +9 -0
- package/crawler/test/lib/crawler/fetcher/retrier_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher/url_filter_test.exs +7 -0
- package/crawler/test/lib/crawler/fetcher_test.exs +153 -0
- package/crawler/test/lib/crawler/http_test.exs +47 -0
- package/crawler/test/lib/crawler/linker/path_builder_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_expander_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_finder_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_offliner_test.exs +7 -0
- package/crawler/test/lib/crawler/linker/path_prefixer_test.exs +7 -0
- package/crawler/test/lib/crawler/linker_test.exs +7 -0
- package/crawler/test/lib/crawler/options_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/css_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/guarder_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/html_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/link_parser/link_expander_test.exs +7 -0
- package/crawler/test/lib/crawler/parser/link_parser_test.exs +7 -0
- package/crawler/test/lib/crawler/parser_test.exs +8 -0
- package/crawler/test/lib/crawler/queue_handler_test.exs +7 -0
- package/crawler/test/lib/crawler/scraper_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper/dir_maker_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper/link_replacer_test.exs +7 -0
- package/crawler/test/lib/crawler/snapper_test.exs +9 -0
- package/crawler/test/lib/crawler/worker_test.exs +5 -0
- package/crawler/test/lib/crawler_test.exs +295 -0
- package/crawler/test/support/test_case.ex +24 -0
- package/crawler/test/support/test_helpers.ex +28 -0
- package/crawler/test/test_helper.exs +7 -0
- package/package.json +1 -1
- package/rubyretriever/.rspec +2 -0
- package/rubyretriever/.travis.yml +7 -0
- package/rubyretriever/Gemfile +3 -0
- package/rubyretriever/Gemfile.lock +64 -0
- package/rubyretriever/LICENSE +20 -0
- package/rubyretriever/Rakefile +7 -0
- package/rubyretriever/bin/rr +79 -0
- package/rubyretriever/lib/retriever/cli.rb +25 -0
- package/rubyretriever/lib/retriever/core_ext.rb +13 -0
- package/rubyretriever/lib/retriever/fetch.rb +268 -0
- package/rubyretriever/lib/retriever/fetchfiles.rb +71 -0
- package/rubyretriever/lib/retriever/fetchseo.rb +18 -0
- package/rubyretriever/lib/retriever/fetchsitemap.rb +43 -0
- package/rubyretriever/lib/retriever/link.rb +47 -0
- package/rubyretriever/lib/retriever/openuri_redirect_patch.rb +8 -0
- package/rubyretriever/lib/retriever/page.rb +104 -0
- package/rubyretriever/lib/retriever/page_iterator.rb +21 -0
- package/rubyretriever/lib/retriever/target.rb +47 -0
- package/rubyretriever/lib/retriever/version.rb +4 -0
- package/rubyretriever/lib/retriever.rb +15 -0
- package/rubyretriever/readme.md +166 -0
- package/rubyretriever/rubyretriever.gemspec +41 -0
- package/rubyretriever/spec/link_spec.rb +77 -0
- package/rubyretriever/spec/page_spec.rb +94 -0
- package/rubyretriever/spec/retriever_spec.rb +84 -0
- package/rubyretriever/spec/spec_helper.rb +17 -0
- package/rubyretriever/spec/target_spec.rb +55 -0
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
RubyRetriever
|
|
2
|
+
==============
|
|
3
|
+
[Gem Version](http://badge.fury.io/rb/rubyretriever) [Build Status](https://travis-ci.org/joenorton/rubyretriever)
|
|
4
|
+
|
|
5
|
+
[RubyRetriever Webpage](https://norton.io/projects/rubyretriever/)
|
|
6
|
+
|
|
7
|
+
By Joe Norton
|
|
8
|
+
|
|
9
|
+
RubyRetriever is a Web Crawler, Scraper & File Harvester. Available as a command-line executable and as a crawling framework.
|
|
10
|
+
|
|
11
|
+
RubyRetriever (RR) uses asynchronous HTTP requests via [Eventmachine](https://github.com/eventmachine/eventmachine) & [Synchrony](https://github.com/igrigorik/em-synchrony) to crawl webpages *very quickly*. RR also uses a Ruby implementation of the [bloomfilter](https://github.com/igrigorik/bloomfilter-rb) in order to keep track of pages it has already crawled in a memory efficient manner.
|
|
12
|
+
|
|
13
|
+
**v1.4.3 Update (3/24/2016)** - Fixes a problem with file downloads whose URLs had query strings: the filename was being saved with the query string still attached. No more.
|
|
14
|
+
|
|
15
|
+
**v1.4.2 Update (3/24/2016)** - Fixes problem with named anchors (divs) being counted as links.
|
|
16
|
+
|
|
17
|
+
**v1.4.1 Update (3/24/2016)** - Update gemfile & external dependency versioning
|
|
18
|
+
|
|
19
|
+
**v1.4.0 Update (3/24/2016)** - Several bug fixes.
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
Mission
|
|
23
|
+
-------
|
|
24
|
+
RubyRetriever aims to be the best command-line crawling and scraping package written in Ruby and a replacement for paid software such as Screaming Frog SEO Spider.
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
Roadmap?
|
|
28
|
+
Not sure. Feel free to offer your thoughts.
|
|
29
|
+
|
|
30
|
+
Some Potential Ideas:
|
|
31
|
+
* 'freeroam mode' - to go on cruising the net endlessly in fileharvest mode
|
|
32
|
+
* 'dead-link finder' mode - collects links returning 404, or other error msgs
|
|
33
|
+
* 'validate robots.txt' mode - outputs the bot-exposed sitemap of your site
|
|
34
|
+
* more sophisticated SEO analysis? replace screaming frog? this would include checks for canonical URL, maybe some keyword density checks, content length checks, etc.
|
|
35
|
+
|
|
36
|
+
Features
|
|
37
|
+
--------
|
|
38
|
+
* Asynchronous HTTP Requests thru EM & Synchrony
|
|
39
|
+
* Bloom filter for tracking visited pages
|
|
40
|
+
* Supports HTTPS
|
|
41
|
+
* Follows 301 redirects (if to same host)
|
|
42
|
+
* 3 CLI modes
|
|
43
|
+
* Sitemap - Find all links on a website, output a valid XML sitemap, or just a CSV
|
|
44
|
+
* File Harvest - find all files linked to on a website, option to autodownload
|
|
45
|
+
* SEO - collect important SEO info from every page, output to a CSV (or STDOUT)
|
|
46
|
+
* Run a Custom Block on a Per-Page basis (PageIterator)
|
|
47
|
+
|
|
48
|
+
Use cases
|
|
49
|
+
---------
|
|
50
|
+
**As an Executable**
|
|
51
|
+
With a single command at the terminal, RR can:
|
|
52
|
+
1. Crawl your website and output a *valid XML sitemap* based on what it found.
|
|
53
|
+
2. Crawl a target website and *download all files of a given filetype*.
|
|
54
|
+
3. Crawl a target website, *collect important SEO information* such as page titles, meta descriptions and h1 tags, and write it to CSV.
|
|
55
|
+
|
|
56
|
+
**Used in Custom scripts**
|
|
57
|
+
As of version 1.3.0, with the PageIterator class you can pass a custom block that will get run against each page during a crawl, and collect the results in an array. This means you can define for yourself whatever it is you want to collect from each page during the crawl.
|
|
58
|
+
|
|
59
|
+
Help & Forks Welcome!
|
|
60
|
+
|
|
61
|
+
Getting started
|
|
62
|
+
-----------
|
|
63
|
+
Install the gem
|
|
64
|
+
```sh
|
|
65
|
+
$ gem install rubyretriever
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
Using the Executable
|
|
70
|
+
--------------------
|
|
71
|
+
**Example: Sitemap mode**
|
|
72
|
+
```sh
|
|
73
|
+
$ rr --sitemap CSV --progress --limit 10 http://www.cnet.com
|
|
74
|
+
```
|
|
75
|
+
OR -- SAME COMMAND
|
|
76
|
+
```sh
|
|
77
|
+
$ rr -s csv -p -l 10 http://www.cnet.com
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
This would map http://www.cnet.com until it crawled a max of 10 pages, then write the results to a CSV named cnet. Optionally, you could also use the format XML and RR would output the same URL list into a valid XML sitemap that could be submitted to Google.
|
|
81
|
+
|
|
82
|
+
**Example: File Harvesting mode**
|
|
83
|
+
```sh
|
|
84
|
+
$ rr --files txt --verbose --limit 1 http://textfiles.com/programming/
|
|
85
|
+
```
|
|
86
|
+
OR -- SAME COMMAND
|
|
87
|
+
```sh
|
|
88
|
+
$ rr -f txt -v -l 1 http://textfiles.com/programming/
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
This would crawl http://textfiles.com/programming/ looking for txt files for only a single page, then write out a list of filepaths to txt files to the terminal. Optionally, you could have the script autodownload all the files by adding the -a/--auto flag.
|
|
92
|
+
|
|
93
|
+
**Example: SEO mode**
|
|
94
|
+
```sh
|
|
95
|
+
$ rr --seo --progress --limit 10 --out cnet-seo http://www.cnet.com
|
|
96
|
+
```
|
|
97
|
+
OR -- SAME COMMAND
|
|
98
|
+
```sh
|
|
99
|
+
$ rr -e -p -l 10 -o cnet-seo http://www.cnet.com
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
This would go to http://www.cnet.com and crawl a max of 10 pages, during which it would collect the SEO fields on those pages - this currently means [url, page title, meta description, h1 text, h2 text]. It would then write the fields to a csv named cnet-seo.
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
command-line arguments
|
|
106
|
+
-----------------------
|
|
107
|
+
Usage: rr [MODE FLAG] [OPTIONS] Target_URL
|
|
108
|
+
|
|
109
|
+
Where MODE FLAG is required, and is either:
|
|
110
|
+
-s, --sitemap FORMAT (only accepts CSV or XML atm)
|
|
111
|
+
-f, --files FILETYPE
|
|
112
|
+
-e, --seo
|
|
113
|
+
|
|
114
|
+
and OPTIONS is any applicable combination of:
|
|
115
|
+
-o, --out FILENAME *Dump fetch data as CSV*
|
|
116
|
+
-p, --progress *Outputs a progressbar*
|
|
117
|
+
-v, --verbose *Output more information*
|
|
118
|
+
-l, --limit PAGE_LIMIT_# *set a max on the total number of crawled pages*
|
|
119
|
+
-h, --help *Display this screen*
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
Using as a Library (starting as of version 1.3.0)
|
|
123
|
+
------------------
|
|
124
|
+
|
|
125
|
+
If you want to collect something other than what the executable allows, on a per-page basis, use the PageIterator class. You can then run whatever block you want against the source code of each individual page located during the crawl.
|
|
126
|
+
|
|
127
|
+
Sample Script using **PageIterator**
|
|
128
|
+
```ruby
|
|
129
|
+
require 'retriever'
|
|
130
|
+
opts = {
|
|
131
|
+
'maxpages' => 1
|
|
132
|
+
}
|
|
133
|
+
t = Retriever::PageIterator.new('http://www.basecamp.com', opts) do |page|
|
|
134
|
+
[page.url, page.title]
|
|
135
|
+
end
|
|
136
|
+
puts t.result.to_s
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
```sh
|
|
140
|
+
>> [["http://www.basecamp.com", "Basecamp is everyone’s favorite project management app."]]
|
|
141
|
+
```
|
|
142
|
+
Available methods on the page iterator:
|
|
143
|
+
* **#url** - returns full URL of current page
|
|
144
|
+
* **#source** - returns raw page source code
|
|
145
|
+
* **#title** - returns HTML-decoded version of current page title
|
|
146
|
+
* **#desc** - returns HTML-decoded version of current page meta description
|
|
147
|
+
* **#h1** - returns HTML-decoded version of current page's h1 tag
|
|
148
|
+
* **#h2** - returns HTML-decoded version of current page's h2 tag
|
|
149
|
+
* **#links** - returns array of all links on the page
|
|
150
|
+
* **#parse_internal** - returns array of current page's internal (same host) links
|
|
151
|
+
* **#parse_internal_visitable** - returns #parse_internal plus added filtering of only links that are visitable
|
|
152
|
+
* **#parse_seo** - returns array of current page's html decoded title, desc, h1 and h2
|
|
153
|
+
* **#parse_files** - returns array of downloaded files of type supplied as RR options (fileharvest options)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
Current Requirements
|
|
157
|
+
------------
|
|
158
|
+
em-synchrony
|
|
159
|
+
ruby-progressbar
|
|
160
|
+
bloomfilter-rb
|
|
161
|
+
addressable
|
|
162
|
+
htmlentities
|
|
163
|
+
|
|
164
|
+
License
|
|
165
|
+
-------
|
|
166
|
+
See included 'LICENSE' file. It's the MIT license.
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# coding: utf-8
|
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
|
+
require 'retriever/version'
|
|
5
|
+
|
|
6
|
+
Gem::Specification.new do |s|
|
|
7
|
+
s.required_ruby_version = ['>= 2.0', '< 2.3']
|
|
8
|
+
s.platform = Gem::Platform::RUBY
|
|
9
|
+
s.version = Retriever::VERSION
|
|
10
|
+
s.name = 'rubyretriever'
|
|
11
|
+
s.date = '2016-04-11'
|
|
12
|
+
s.summary = 'Ruby Web Crawler & File Harvester'
|
|
13
|
+
s.description = 'Asynchronous web crawler, scraper and file harvester'
|
|
14
|
+
s.authors = ['Joe Norton']
|
|
15
|
+
s.email = ['joe@norton.io']
|
|
16
|
+
s.homepage = 'http://norton.io/rubyretriever/'
|
|
17
|
+
s.license = 'MIT'
|
|
18
|
+
# If you need to check in files that aren't .rb files, add them here
|
|
19
|
+
s.files = Dir['{lib}/**/*.rb', 'bin/*', 'LICENSE', '*.md',
|
|
20
|
+
'{spec}/*.rb']
|
|
21
|
+
s.require_path = 'lib'
|
|
22
|
+
s.rubyforge_project = 'rubyretriever'
|
|
23
|
+
|
|
24
|
+
# If you need an executable, add it here
|
|
25
|
+
s.executables = ['rr']
|
|
26
|
+
s.required_rubygems_version = '>= 1.3.6'
|
|
27
|
+
|
|
28
|
+
# If you have other dependencies, add them here
|
|
29
|
+
s.add_runtime_dependency 'em-synchrony'
|
|
30
|
+
s.add_runtime_dependency 'em-http-request'
|
|
31
|
+
s.add_runtime_dependency 'ruby-progressbar'
|
|
32
|
+
s.add_runtime_dependency 'bloomfilter-rb'
|
|
33
|
+
s.add_runtime_dependency 'addressable'
|
|
34
|
+
s.add_runtime_dependency 'htmlentities'
|
|
35
|
+
s.add_runtime_dependency 'nokogiri'
|
|
36
|
+
|
|
37
|
+
s.add_development_dependency 'bundler', '~> 1.6'
|
|
38
|
+
s.add_development_dependency 'rake', '~> 10.3'
|
|
39
|
+
s.add_development_dependency 'rspec', '~> 2.14'
|
|
40
|
+
s.add_development_dependency 'pry'
|
|
41
|
+
end
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
require 'retriever'
|
|
2
|
+
|
|
3
|
+
describe 'Link' do
|
|
4
|
+
|
|
5
|
+
t = Retriever::Target.new('http://www.cnet.com/reviews/')
|
|
6
|
+
let(:links) do
|
|
7
|
+
Retriever::Page.new('http://www.cnet.com/reviews/', @source, t).links
|
|
8
|
+
end
|
|
9
|
+
|
|
10
|
+
it 'collects links in anchor tags' do
|
|
11
|
+
@source = (<<SOURCE).strip
|
|
12
|
+
<a href='http://www.cnet.com/download.exe'>download</a>
|
|
13
|
+
SOURCE
|
|
14
|
+
|
|
15
|
+
expect(links).to include('http://www.cnet.com/download.exe')
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
it 'collects links in link tags' do
|
|
19
|
+
@source = (<<SOURCE).strip
|
|
20
|
+
<link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />
|
|
21
|
+
SOURCE
|
|
22
|
+
|
|
23
|
+
expect(links[0]).to include('formreset.css?ver=1.7.12')
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
it 'does not collect bare links (ones not in an href)' do
|
|
27
|
+
@source = (<<SOURCE).strip
|
|
28
|
+
http://www.google.com
|
|
29
|
+
SOURCE
|
|
30
|
+
|
|
31
|
+
expect(links).to_not include('http://www.google.com')
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
it 'collects only unique href links on the page' do
|
|
35
|
+
@source = (<<SOURCE).strip
|
|
36
|
+
<a href='http://www.cnet.com/products/gadgets'>gadgets</a>
|
|
37
|
+
<a href='http://www.cnet.com/products/gadgets'>gadgets2</a>
|
|
38
|
+
SOURCE
|
|
39
|
+
|
|
40
|
+
expect(links.size).to eq(1)
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
it 'adds a protocol to urls missing them (www.)' do
|
|
44
|
+
@source = (<<SOURCE).strip
|
|
45
|
+
<a href='www.cnet.com/download.exe'>download</a>
|
|
46
|
+
SOURCE
|
|
47
|
+
|
|
48
|
+
expect(links).to include('http://www.cnet.com/download.exe')
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
it "doesn\'t care about any extra attributes on the anchor tag" do
|
|
52
|
+
@source = (<<SOURCE).strip
|
|
53
|
+
<a href='http://www.cnet.com/products/gadgets/'>gadgets </a>
|
|
54
|
+
<a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
|
|
55
|
+
</a>
|
|
56
|
+
SOURCE
|
|
57
|
+
|
|
58
|
+
expect(links.size).to eq(1)
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
it 'returns relative urls with full path based on hostname' do
|
|
62
|
+
@source = (<<SOURCE).strip
|
|
63
|
+
<a href='/test.html'>test</a>
|
|
64
|
+
<a href='cpage_18'>about</a>
|
|
65
|
+
SOURCE
|
|
66
|
+
|
|
67
|
+
expect(links).to include('http://www.cnet.com/test.html',
|
|
68
|
+
'http://www.cnet.com/reviews/cpage_18')
|
|
69
|
+
end
|
|
70
|
+
it 'collects files even when query strings exist' do
|
|
71
|
+
@source = (<<SOURCE).strip
|
|
72
|
+
<a href='http://mises.org/system/tdf/Robert%20Nozick%20and%20Murray%20Rothbard%20David%20Gordon.mp3?file=1&type=audio' type='audio/mpeg; length=22217599' title='Robert Nozick and Murray Rothbard David Gordon.mp3'>Download audio file</a></span></div>
|
|
73
|
+
SOURCE
|
|
74
|
+
|
|
75
|
+
expect(links).to include('http://mises.org/system/tdf/Robert%20Nozick%20and%20Murray%20Rothbard%20David%20Gordon.mp3?file=1&type=audio')
|
|
76
|
+
end
|
|
77
|
+
end
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
require 'retriever/page'
|
|
2
|
+
require 'retriever/fetch'
|
|
3
|
+
|
|
4
|
+
t = Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)
|
|
5
|
+
|
|
6
|
+
describe 'Page' do
|
|
7
|
+
let(:common_source) do
|
|
8
|
+
<<-SOURCE
|
|
9
|
+
<title>test</title>
|
|
10
|
+
<a href='www.cnet.com/download.exe'>download</a>
|
|
11
|
+
<a href='/test.html'>test</a>
|
|
12
|
+
<a href='http://www.cnet.com/products/gadgets/' data-vanity-rewritten='true'>
|
|
13
|
+
</a>
|
|
14
|
+
<a href='http://www.cnet.com/products/gadgets/' id='gadgets-link'>gadgets </a>
|
|
15
|
+
<a href='http://www.yahoo.com/test/'>yahoo</a>"
|
|
16
|
+
<meta name='description' content="test2 ">
|
|
17
|
+
<h1>test 3</h1>
|
|
18
|
+
<h2> test 4 </h2>
|
|
19
|
+
SOURCE
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
describe '#url' do
|
|
23
|
+
let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
|
|
24
|
+
it 'returns current page URL' do
|
|
25
|
+
expect(page.url).to eq('http://www.cnet.com/')
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
describe '#links' do
|
|
30
|
+
let(:source) { "<a href='/profile/'>profile</a><a href='#top'>top</a> <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />" }
|
|
31
|
+
let(:page) { Retriever::Page.new('http://www.cnet.com/', source, t) }
|
|
32
|
+
it 'collects all unique href links on the page, skips div anchors' do
|
|
33
|
+
expect(page.links.size).to eq(2)
|
|
34
|
+
end
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
describe '#parse_internal' do
|
|
38
|
+
let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
|
|
39
|
+
let(:links) { page.parse_internal }
|
|
40
|
+
it 'filters links by host' do
|
|
41
|
+
expect(links.size).to eq(3)
|
|
42
|
+
end
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
describe '#parse_internal_visitable' do
|
|
46
|
+
let(:source) { "<a href='/profile/'>profile</a> <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />" }
|
|
47
|
+
let(:page) { Retriever::Page.new('http://www.cnet.com/', source, t) }
|
|
48
|
+
let(:links) { page.parse_internal_visitable }
|
|
49
|
+
it "filters out 'unvisitable' URLS like JS, Stylesheets, Images" do
|
|
50
|
+
expect(links.size).to eq(1)
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
describe '#parse_files' do
|
|
55
|
+
let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
|
|
56
|
+
let(:files) { page.parse_files(page.parse_internal) }
|
|
57
|
+
it 'filters links by filetype' do
|
|
58
|
+
expect(files.size).to eq(1)
|
|
59
|
+
end
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
describe '#parse_by_css' do
|
|
63
|
+
let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
|
|
64
|
+
|
|
65
|
+
it 'returns the text from the received css selector' do
|
|
66
|
+
expect(page.parse_by_css('#gadgets-link')).to eq('gadgets ')
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
describe '#title' do
|
|
71
|
+
let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
|
|
72
|
+
it 'returns page title' do
|
|
73
|
+
expect(page.title).to eq('test')
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
describe '#desc' do
|
|
77
|
+
let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
|
|
78
|
+
it 'returns meta description' do
|
|
79
|
+
expect(page.desc).to eq('test2 ')
|
|
80
|
+
end
|
|
81
|
+
end
|
|
82
|
+
describe '#h1' do
|
|
83
|
+
let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
|
|
84
|
+
it 'returns h1 text' do
|
|
85
|
+
expect(page.h1).to eq('test 3')
|
|
86
|
+
end
|
|
87
|
+
end
|
|
88
|
+
describe '#h2' do
|
|
89
|
+
let(:page) { Retriever::Page.new('http://www.cnet.com/', common_source, t) }
|
|
90
|
+
it 'returns h2 text' do
|
|
91
|
+
expect(page.h2).to eq(' test 4 ')
|
|
92
|
+
end
|
|
93
|
+
end
|
|
94
|
+
end
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
require 'retriever'
|
|
2
|
+
require 'retriever/fetchfiles'
|
|
3
|
+
|
|
4
|
+
describe 'Fetch' do
|
|
5
|
+
let(:r) do
|
|
6
|
+
Retriever::Fetch.new('http://www.yahoo.com', {})
|
|
7
|
+
end
|
|
8
|
+
describe '#good_response?' do
|
|
9
|
+
|
|
10
|
+
let(:resp) do
|
|
11
|
+
{}
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
let(:nil_response) do
|
|
15
|
+
r.good_response?(nil, 'http://www.yahoo.com')
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
let(:unsuccessful_resp) do
|
|
19
|
+
resp.stub(:response_header).and_return(resp)
|
|
20
|
+
resp.stub(:redirection?).and_return(false)
|
|
21
|
+
resp.stub(:successful?).and_return(false)
|
|
22
|
+
resp.stub(:server_error?).and_return(false)
|
|
23
|
+
resp.stub(:client_error?).and_return(false)
|
|
24
|
+
r.good_response?(resp, 'http://www.yahoo.com')
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
let(:redir_resp) do
|
|
28
|
+
resp.stub(:response_header).and_return(resp)
|
|
29
|
+
resp.stub(:redirection?).and_return(true)
|
|
30
|
+
resp.stub(:location).and_return('http://www.google.com')
|
|
31
|
+
r.good_response?(resp, 'http://www.yahoo.com')
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
let(:bad_content_type_resp) do
|
|
35
|
+
resp.stub(:response_header).and_return(resp)
|
|
36
|
+
resp.stub(:redirection?).and_return(false)
|
|
37
|
+
resp.stub(:successful?).and_return(true)
|
|
38
|
+
resp['CONTENT_TYPE'] = 'image/jpeg'
|
|
39
|
+
r.good_response?(resp, 'http://www.yahoo.com')
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
let(:success_resp) do
|
|
43
|
+
resp.stub(:response_header).and_return(resp)
|
|
44
|
+
resp.stub(:redirection?).and_return(false)
|
|
45
|
+
resp.stub(:successful?).and_return(true)
|
|
46
|
+
resp['CONTENT_TYPE'] = 'text/html'
|
|
47
|
+
r.good_response?(resp, 'http://www.yahoo.com')
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
it 'returns false if the response is empty' do
|
|
51
|
+
expect(nil_response).to eq(false)
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
it 'returns false on unsuccessful connection' do
|
|
55
|
+
expect(unsuccessful_resp).to eq(false)
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
it 'returns false on redirecting host' do
|
|
59
|
+
expect(redir_resp).to eq(false)
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
it 'returns false on non-visitable content type' do
|
|
63
|
+
expect(bad_content_type_resp).to eq(false)
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
it 'returns true otherwise' do
|
|
67
|
+
expect(success_resp).to eq(true)
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
describe '#filter_out_querystrings' do
|
|
71
|
+
let(:normal_url) do
|
|
72
|
+
r.filter_out_querystrings('http://mises.org/test.mp3')
|
|
73
|
+
end
|
|
74
|
+
let(:query_string_url) do
|
|
75
|
+
r.filter_out_querystrings('http://mises.org/system/tdf/Robert%20Nozick%20and%20Murray%20Rothbard%20David%20Gordon.mp3?file=1&type=audio')
|
|
76
|
+
end
|
|
77
|
+
it 'accepts standard urls' do
|
|
78
|
+
expect(normal_url).to eq('http://mises.org/test.mp3')
|
|
79
|
+
end
|
|
80
|
+
it 'strips query params' do
|
|
81
|
+
expect(query_string_url).to eq('http://mises.org/system/tdf/Robert%20Nozick%20and%20Murray%20Rothbard%20David%20Gordon.mp3')
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
|
2
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
|
3
|
+
# Require this file using `require "spec_helper"` to ensure that it is only
|
|
4
|
+
# loaded once.
|
|
5
|
+
#
|
|
6
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
|
7
|
+
RSpec.configure do |config|
|
|
8
|
+
config.treat_symbols_as_metadata_keys_with_true_values = true
|
|
9
|
+
config.run_all_when_everything_filtered = true
|
|
10
|
+
config.filter_run :focus
|
|
11
|
+
|
|
12
|
+
# Run specs in random order to surface order dependencies. If you find an
|
|
13
|
+
# order dependency and want to debug it, you can fix the order by providing
|
|
14
|
+
# the seed, which is printed after each run.
|
|
15
|
+
# --seed 1234
|
|
16
|
+
config.order = 'random'
|
|
17
|
+
end
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
require 'retriever'
|
|
2
|
+
require 'open-uri'
|
|
3
|
+
|
|
4
|
+
describe 'Target' do
|
|
5
|
+
let(:t) do
|
|
6
|
+
Retriever::Target.new('http://www.cnet.com/reviews/', /\.exe\z/)
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
it 'creates target var' do
|
|
10
|
+
expect(t.target).to eq('http://www.cnet.com/reviews/')
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
it 'creates host var' do
|
|
14
|
+
expect(t.host).to eq('www.cnet.com')
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
it 'creates host_re var' do
|
|
18
|
+
expect(t.host_re).to eq(/cnet.com/)
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
it 'creates port var (no port specified)' do
|
|
22
|
+
expect(t.port).to be_nil
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
it 'creates port var (with port specified)' do
|
|
26
|
+
expect(Retriever::Target.new('http://www.cnet.com:3000/reviews/', /\.exe\z/).port).to be(3000)
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
it 'creates file_re var (when provided)' do
|
|
30
|
+
expect(t.file_re).to eq(/\.exe\z/)
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
it 'adds protocol to Target URL if none given' do
|
|
34
|
+
expect(Retriever::Target.new('cnet.com').target).to eq('http://cnet.com')
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
it 'fails if given URL has no dot in it' do
|
|
38
|
+
expect { Retriever::Target.new('cnetcom') }.to raise_error
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
describe '#source' do
|
|
42
|
+
let(:redirecting_url) do
|
|
43
|
+
Retriever::Target.new('http://software-by-joe.appspot.com').source
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
it 'opens URL and returns source as String' do
|
|
47
|
+
expect(Retriever::Target.new('http://techcrunch.com/').source.class)
|
|
48
|
+
.to eq(String)
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
it 'fails if target redirects to new host' do
|
|
52
|
+
expect { redirecting_url }.to raise_error
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
end
|