trawler 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/.rspec +1 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +52 -0
- data/Rakefile +1 -0
- data/lib/trawler/document.rb +22 -0
- data/lib/trawler/parsed_document.rb +30 -0
- data/lib/trawler/parser.rb +94 -0
- data/lib/trawler/spider.rb +24 -0
- data/lib/trawler/version.rb +3 -0
- data/lib/trawler.rb +11 -0
- data/spec/document_spec.rb +31 -0
- data/spec/fixtures/sample_pages/simple.html +10 -0
- data/spec/fixtures/sample_pages/tumblr.html +840 -0
- data/spec/fixtures/sample_pages/youtube.html +1404 -0
- data/spec/fixtures/vcr_cassettes/trawl_page.yml +57 -0
- data/spec/parsed_document_spec.rb +32 -0
- data/spec/parser_spec.rb +100 -0
- data/spec/spec_helper.rb +26 -0
- data/spec/spider_spec.rb +52 -0
- data/spec/trawler_spec.rb +30 -0
- data/trawler.gemspec +30 -0
- metadata +189 -0
@@ -0,0 +1,57 @@
|
|
1
|
+
---
|
2
|
+
http_interactions:
|
3
|
+
- request:
|
4
|
+
method: get
|
5
|
+
uri: http://www.cats.com/
|
6
|
+
body:
|
7
|
+
encoding: US-ASCII
|
8
|
+
string: ''
|
9
|
+
headers:
|
10
|
+
Accept-Encoding:
|
11
|
+
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
|
12
|
+
Accept:
|
13
|
+
- '*/*'
|
14
|
+
User-Agent:
|
15
|
+
- Ruby
|
16
|
+
response:
|
17
|
+
status:
|
18
|
+
code: 200
|
19
|
+
message: OK
|
20
|
+
headers:
|
21
|
+
Server:
|
22
|
+
- Apache
|
23
|
+
Set-Cookie:
|
24
|
+
- COOKIE=10.5.19.71.1382123959465997; path=/
|
25
|
+
- referrer=; path=/
|
26
|
+
- referrer=www.cats.com; path=/
|
27
|
+
- t=2fe697b0382a11e3a9020015c5e70714; path=/
|
28
|
+
Etag:
|
29
|
+
- '"AAAAUHDDtBw"'
|
30
|
+
Last-Modified:
|
31
|
+
- Wed, 16 Oct 2013 20:57:10 GMT
|
32
|
+
Vary:
|
33
|
+
- Accept-Encoding,User-Agent
|
34
|
+
Cartoon:
|
35
|
+
- aalander3
|
36
|
+
Content-Type:
|
37
|
+
- text/html; charset=UTF-8
|
38
|
+
Content-Length:
|
39
|
+
- '39'
|
40
|
+
Accept-Ranges:
|
41
|
+
- bytes
|
42
|
+
Date:
|
43
|
+
- Fri, 18 Oct 2013 19:19:19 GMT
|
44
|
+
X-Varnish:
|
45
|
+
- '2126042946'
|
46
|
+
Age:
|
47
|
+
- '0'
|
48
|
+
Via:
|
49
|
+
- 1.1 varnish
|
50
|
+
Connection:
|
51
|
+
- keep-alive
|
52
|
+
body:
|
53
|
+
encoding: UTF-8
|
54
|
+
string: "<html>\r\n<body>\r\n</body>\r\n</html>\r\n"
|
55
|
+
http_version:
|
56
|
+
recorded_at: Fri, 18 Oct 2013 19:19:14 GMT
|
57
|
+
recorded_with: VCR 2.6.0
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
# Checks that a Trawler::ParsedDocument built from a url and a parser-like
# object exposes the url plus the values that object reports.
describe Trawler::ParsedDocument do
  # Test double standing in for the parser handed to the constructor.
  let(:parser_stub) do
    double(
      "PARSER",
      images: ["foo"],
      title: "foobar",
      video: "bar",
      description: "foobarbaz",
      document: ""
    )
  end

  let(:parsed) { Trawler::ParsedDocument.new("www.cats.com", parser_stub) }

  it "sets the url" do
    expect(parsed.url).to eq("www.cats.com")
  end

  it "has images" do
    expect(parsed.images).to be_a(Array)
    expect(parsed.images).to include("foo")
  end

  it "has a title" do
    expect(parsed.title).to eq("foobar")
  end

  it "has a video" do
    expect(parsed.video).to eq("bar")
  end

  it "has a description" do
    expect(parsed.description).to eq("foobarbaz")
  end

  # NOTE(review): raw_data presumably mirrors the double's `document` value;
  # confirm against lib/trawler/parsed_document.rb.
  it "has the raw data" do
    expect(parsed.raw_data).to eq("")
  end
end
|
data/spec/parser_spec.rb
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
# Exercises Trawler::Parser against three saved HTML fixtures
# (tumblr.html, youtube.html and simple.html under sample_pages/).
describe Trawler::Parser do
  context "image rich page" do
    let(:page) { fixture("sample_pages/tumblr.html") }
    let(:parser) do
      Trawler::Parser.new(
        page: page,
        url: "http://www.foo.com",
        image_size: "100"
      )
    end

    describe "#title" do
      it "parses the page title" do
        expect(parser.title).to eq "Dogshaming"
      end
    end

    describe "#description" do
      it "parses the description from the meta data" do
        expect(parser.description).not_to be_empty
      end
    end

    # Named after the method actually exercised (`video`, singular).
    describe "#video" do
      it "returns nil if no videos are found" do
        expect(parser.video).to be_nil
      end
    end

    describe "#images" do
      it "returns an array of images" do
        expect(parser.images).to be_a Array
      end

      # NOTE(review): the count of 10 is tied to the tumblr.html fixture and,
      # presumably, to the image_size of "100" passed above.
      it "returns all the images that are larger than the min image size" do
        expect(parser.images.size).to eq 10
      end
    end
  end

  context "video page" do
    let(:page) { fixture("sample_pages/youtube.html") }
    let(:parser) do
      Trawler::Parser.new(
        page: page,
        url: "http://www.foo.com",
        image_size: "100"
      )
    end

    it "gets the title" do
      expect(parser.title).to eq "Single Page Web Applications: JavaScript End-to-End (The Hard Stuff)"
    end

    it "gets the video" do
      expect(parser.video).to eq "http://www.youtube.com/v/OrIFaWJ9Glo?version=3&autohide=1"
    end

    it "gets the description" do
      expect(parser.description).to eq "In the old days, when websites were steam powered and exploded regularly, the web was simple, but slow. As it evolved it became more powerful, but harder on ..."
    end

    it "gets the images" do
      expect(parser.images).to include "http://i1.ytimg.com/vi/OrIFaWJ9Glo/maxresdefault.jpg"
    end
  end

  context "document style page" do
    let(:page) { fixture("sample_pages/simple.html") }
    let(:parser) do
      Trawler::Parser.new(
        page: page,
        url: "foobar",
        image_size: "100"
      )
    end

    let(:description) do
      "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed dapibus velit in lacus mollis vehicula nec a arcu"
    end

    # Only the first 121 characters of the parsed description are compared.
    # `match` is given a String, which is used as a regular expression
    # pattern, so this is a "contains" check rather than strict equality.
    it "returns the description from the body of the text" do
      expect(parser.description[0..120]).to match description
    end

    it "gets the title from the html" do
      expect(parser.title).to eq "Super simple html page"
    end

    it "gets no images" do
      expect(parser.images).to be_empty
    end

    it "gets no video" do
      expect(parser.video).to be_nil
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'webmock/rspec'
|
2
|
+
require "simplecov"
|
3
|
+
require "json"
|
4
|
+
require "vcr"
|
5
|
+
|
6
|
+
# Coverage is started here, ahead of the library require further down;
# anything under /spec/ is filtered out of the report.
SimpleCov.start { add_filter "/spec/" }

# Recorded HTTP interactions live in spec/fixtures/vcr_cassettes and are
# replayed through WebMock.
VCR.configure do |config|
  config.cassette_library_dir = 'spec/fixtures/vcr_cassettes'
  config.hook_into :webmock
end

# Put the directory one level above spec/ on the load path so the library can
# be required by its path relative to that directory.
$LOAD_PATH.unshift(
  File.expand_path("../..", __FILE__)
)

require 'lib/trawler'
|
18
|
+
|
19
|
+
# Resolves a location inside the "fixtures" directory that sits next to this
# file.
#
# @param filename [String, nil] path relative to the fixtures directory
# @return [String] the expanded path of the fixtures directory itself when
#   +filename+ is nil, otherwise the path of +filename+ inside it
def fixture_path(filename = nil)
  fixtures_dir = File.expand_path("../fixtures", __FILE__)
  return fixtures_dir if filename.nil?

  File.join(fixtures_dir, filename)
end
|
23
|
+
|
24
|
+
# Reads a fixture file and returns its contents.
#
# @param file [String] path relative to the directory returned by fixture_path
# @return [String] the full contents of the file
def fixture(file)
  full_path = File.join(fixture_path, file)
  File.read(full_path)
end
|
data/spec/spider_spec.rb
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
# Covers Trawler::Spider: url storage, scheme completion in #full_url, page
# retrieval via #get_page and the chained result of #call. HTTP traffic is
# replayed from the "trawl_page" VCR cassette.
describe Trawler::Spider do
  let(:spider) { Trawler::Spider.new("www.cats.com") }

  # Reads the instance variable directly; the examples here do not rely on a
  # public reader for the raw url.
  it "assigns the url" do
    expect(spider.instance_variable_get("@url")).to eq "www.cats.com"
  end

  describe "#full_url" do
    context "without a full scheme" do
      it "adds the protocol to the url" do
        expect(spider.full_url).to eq "http://www.cats.com"
      end
    end

    context "with a full scheme" do
      let(:spider) { Trawler::Spider.new("https://foo.com") }
      it "returns the url" do
        expect(spider.full_url).to eq "https://foo.com"
      end
    end
  end

  describe "#get_page" do
    # The example name now states what is actually asserted: the result is a
    # StringIO, not a String.
    it "returns a StringIO" do
      VCR.use_cassette("trawl_page") do
        expect(spider.get_page).to be_a StringIO
      end
    end
  end

  describe "#call" do
    let(:crawled_spider) do
      VCR.use_cassette("trawl_page") do
        spider.call
      end
    end

    it "returns the object" do
      expect(crawled_spider).to be_a Trawler::Spider
    end

    it "has the full_url" do
      expect(crawled_spider.full_url).to eq "http://www.cats.com"
    end

    it "has the page" do
      expect(crawled_spider.page).not_to be_nil
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
# End-to-end check of Trawler.fetch using a saved page in place of a real
# download.
describe Trawler do
  # "." prefix: fetch is called on the Trawler module itself, not on an
  # instance.
  describe ".fetch" do

    # Every spider's get_page is stubbed to hand back the tumblr fixture, so
    # the examples below parse that saved page.
    before do
      Trawler::Spider.any_instance.stub(:get_page) { fixture("sample_pages/tumblr.html") }
    end

    let(:haul) { Trawler.fetch("http://www.dogshaming.com/") }

    it "returns a ParsedDocument" do
      expect(haul).to be_a(Trawler::ParsedDocument)
    end

    it "has the original url" do
      expect(haul.url).to eq("http://www.dogshaming.com/")
    end

    # One generated example per reader the result is expected to expose.
    [:video, :images, :description, :title].each do |attr|
      it "has #{attr}" do
        expect(haul).to respond_to(attr)
      end
    end

    it "returns an array of images" do
      expect(haul.images).to be_a Array
    end
  end
end
|
data/trawler.gemspec
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'trawler/version'
|
5
|
+
|
6
|
+
# Gem specification for trawler: identity, packaged files and dependencies.
Gem::Specification.new do |spec|
  spec.name          = "trawler"
  spec.version       = Trawler::VERSION
  spec.authors       = ["Jon Wheeler"]
  spec.email         = ["jon@doejo.com"]
  spec.description   = %q{Scrapes the web}
  spec.summary       = %q{Scrape a url for images, video links and meta descriptions}
  spec.homepage      = %q{https://github.com/Jonwheeler/trawler}
  spec.license       = "MIT"

  # NUL-delimited listing: paths containing whitespace or newlines stay
  # intact, and the result no longer depends on the current value of $/.
  spec.files         = `git ls-files -z`.split("\x0")
  # Executables and test files are derived from the tracked file list.
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependencies.
  spec.add_dependency "nokogiri", "~> 1.6"
  spec.add_dependency "hashr", "~> 0.0.22"

  # Development and test tooling.
  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rspec", "~> 2.14"
  spec.add_development_dependency "webmock", "~> 1.14"
  spec.add_development_dependency "vcr", "~> 2.6"
  spec.add_development_dependency "simplecov", "~> 0.7"
end
|
metadata
ADDED
@@ -0,0 +1,189 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: trawler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jon Wheeler
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-10-18 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.6'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.6'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: hashr
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.0.22
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ~>
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 0.0.22
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: bundler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ~>
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.3'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ~>
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.3'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rspec
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ~>
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '2.14'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ~>
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '2.14'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: webmock
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ~>
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '1.14'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ~>
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '1.14'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: vcr
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ~>
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '2.6'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ~>
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '2.6'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: simplecov
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ~>
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0.7'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ~>
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0.7'
|
125
|
+
description: Scrapes the web
|
126
|
+
email:
|
127
|
+
- jon@doejo.com
|
128
|
+
executables: []
|
129
|
+
extensions: []
|
130
|
+
extra_rdoc_files: []
|
131
|
+
files:
|
132
|
+
- .gitignore
|
133
|
+
- .rspec
|
134
|
+
- Gemfile
|
135
|
+
- LICENSE.txt
|
136
|
+
- README.md
|
137
|
+
- Rakefile
|
138
|
+
- lib/trawler.rb
|
139
|
+
- lib/trawler/document.rb
|
140
|
+
- lib/trawler/parsed_document.rb
|
141
|
+
- lib/trawler/parser.rb
|
142
|
+
- lib/trawler/spider.rb
|
143
|
+
- lib/trawler/version.rb
|
144
|
+
- spec/document_spec.rb
|
145
|
+
- spec/fixtures/sample_pages/simple.html
|
146
|
+
- spec/fixtures/sample_pages/tumblr.html
|
147
|
+
- spec/fixtures/sample_pages/youtube.html
|
148
|
+
- spec/fixtures/vcr_cassettes/trawl_page.yml
|
149
|
+
- spec/parsed_document_spec.rb
|
150
|
+
- spec/parser_spec.rb
|
151
|
+
- spec/spec_helper.rb
|
152
|
+
- spec/spider_spec.rb
|
153
|
+
- spec/trawler_spec.rb
|
154
|
+
- trawler.gemspec
|
155
|
+
homepage: https://github.com/Jonwheeler/trawler
|
156
|
+
licenses:
|
157
|
+
- MIT
|
158
|
+
metadata: {}
|
159
|
+
post_install_message:
|
160
|
+
rdoc_options: []
|
161
|
+
require_paths:
|
162
|
+
- lib
|
163
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
164
|
+
requirements:
|
165
|
+
- - '>='
|
166
|
+
- !ruby/object:Gem::Version
|
167
|
+
version: '0'
|
168
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
169
|
+
requirements:
|
170
|
+
- - '>='
|
171
|
+
- !ruby/object:Gem::Version
|
172
|
+
version: '0'
|
173
|
+
requirements: []
|
174
|
+
rubyforge_project:
|
175
|
+
rubygems_version: 2.1.9
|
176
|
+
signing_key:
|
177
|
+
specification_version: 4
|
178
|
+
summary: Scrape a url for images, video links and meta descriptions
|
179
|
+
test_files:
|
180
|
+
- spec/document_spec.rb
|
181
|
+
- spec/fixtures/sample_pages/simple.html
|
182
|
+
- spec/fixtures/sample_pages/tumblr.html
|
183
|
+
- spec/fixtures/sample_pages/youtube.html
|
184
|
+
- spec/fixtures/vcr_cassettes/trawl_page.yml
|
185
|
+
- spec/parsed_document_spec.rb
|
186
|
+
- spec/parser_spec.rb
|
187
|
+
- spec/spec_helper.rb
|
188
|
+
- spec/spider_spec.rb
|
189
|
+
- spec/trawler_spec.rb
|