trawler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,57 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: http://www.cats.com/
6
+ body:
7
+ encoding: US-ASCII
8
+ string: ''
9
+ headers:
10
+ Accept-Encoding:
11
+ - gzip;q=1.0,deflate;q=0.6,identity;q=0.3
12
+ Accept:
13
+ - '*/*'
14
+ User-Agent:
15
+ - Ruby
16
+ response:
17
+ status:
18
+ code: 200
19
+ message: OK
20
+ headers:
21
+ Server:
22
+ - Apache
23
+ Set-Cookie:
24
+ - COOKIE=10.5.19.71.1382123959465997; path=/
25
+ - referrer=; path=/
26
+ - referrer=www.cats.com; path=/
27
+ - t=2fe697b0382a11e3a9020015c5e70714; path=/
28
+ Etag:
29
+ - '"AAAAUHDDtBw"'
30
+ Last-Modified:
31
+ - Wed, 16 Oct 2013 20:57:10 GMT
32
+ Vary:
33
+ - Accept-Encoding,User-Agent
34
+ Cartoon:
35
+ - aalander3
36
+ Content-Type:
37
+ - text/html; charset=UTF-8
38
+ Content-Length:
39
+ - '39'
40
+ Accept-Ranges:
41
+ - bytes
42
+ Date:
43
+ - Fri, 18 Oct 2013 19:19:19 GMT
44
+ X-Varnish:
45
+ - '2126042946'
46
+ Age:
47
+ - '0'
48
+ Via:
49
+ - 1.1 varnish
50
+ Connection:
51
+ - keep-alive
52
+ body:
53
+ encoding: UTF-8
54
+ string: "<html>\r\n<body>\r\n</body>\r\n</html>\r\n"
55
+ http_version:
56
+ recorded_at: Fri, 18 Oct 2013 19:19:14 GMT
57
+ recorded_with: VCR 2.6.0
@@ -0,0 +1,32 @@
1
+ require "spec_helper"
2
+
3
# Checks the readers of Trawler::ParsedDocument when it is built from a url
# and a test double standing in for the parser.
describe Trawler::ParsedDocument do
  # Canned parser: every attribute the examples read has a fixed value.
  let(:parser_double) do
    double(
      "PARSER",
      images: ["foo"],
      title: "foobar",
      video: "bar",
      description: "foobarbaz",
      document: ""
    )
  end

  let(:parsed) { Trawler::ParsedDocument.new("www.cats.com", parser_double) }

  it "sets the url" do
    expect(parsed.url).to eq("www.cats.com")
  end

  it "has images" do
    expect(parsed.images).to be_a(Array)
    expect(parsed.images).to include("foo")
  end

  it "has a title" do
    expect(parsed.title).to eq("foobar")
  end

  it "has a video" do
    expect(parsed.video).to eq("bar")
  end

  it "has a description" do
    expect(parsed.description).to eq("foobarbaz")
  end

  it "has the raw data" do
    # NOTE(review): presumably raw_data comes from the double's #document
    # (stubbed as "") — confirm against ParsedDocument.
    expect(parsed.raw_data).to eq("")
  end
end
@@ -0,0 +1,100 @@
1
+ require "spec_helper"
2
+
3
# Specs for Trawler::Parser, run against three saved sample pages: an
# image-heavy page, a video page and a plain document-style page.
describe Trawler::Parser do
  # Shared construction. Each context supplies its own +page+ and may
  # override +url+.
  # NOTE(review): image_size looks like the minimum image size used to filter
  # images (see the "#images" examples) — confirm against Parser.
  let(:url) { "http://www.foo.com" }
  let(:parser) do
    Trawler::Parser.new(
      page: page,
      url: url,
      image_size: "100"
    )
  end

  context "image rich page" do
    let(:page) { fixture("sample_pages/tumblr.html") }

    describe "#title" do
      it "parses the page title" do
        expect(parser.title).to eq "Dogshaming"
      end
    end

    describe "#description" do
      it "parses the description from the meta data" do
        expect(parser.description).not_to be_empty
      end
    end

    # Named after the method actually exercised (Parser#video).
    describe "#video" do
      it "returns nil if no videos are found" do
        expect(parser.video).to be_nil
      end
    end

    describe "#images" do
      it "returns an array of images" do
        expect(parser.images).to be_a Array
      end

      it "returns all the images that are larger than the min image size" do
        expect(parser.images.size).to eq 10
      end
    end
  end

  context "video page" do
    let(:page) { fixture("sample_pages/youtube.html") }

    it "gets the title" do
      expect(parser.title).to eq "Single Page Web Applications: JavaScript End-to-End (The Hard Stuff)"
    end

    it "gets the video" do
      expect(parser.video).to eq "http://www.youtube.com/v/OrIFaWJ9Glo?version=3&autohide=1"
    end

    it "gets the description" do
      expect(parser.description).to eq "In the old days, when websites were steam powered and exploded regularly, the web was simple, but slow. As it evolved it became more powerful, but harder on ..."
    end

    it "gets the images" do
      expect(parser.images).to include "http://i1.ytimg.com/vi/OrIFaWJ9Glo/maxresdefault.jpg"
    end
  end

  context "document style page" do
    let(:page) { fixture("sample_pages/simple.html") }
    let(:url) { "foobar" }

    let(:description) do
      "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed dapibus velit in lacus mollis vehicula nec a arcu"
    end

    it "returns the description from the body of the text" do
      # Only the first 121 characters are compared. `match` is given a String,
      # so it is applied as a pattern and the periods match any character.
      expect(parser.description[0..120]).to match description
    end

    it "gets the title from the html" do
      expect(parser.title).to eq "Super simple html page"
    end

    it "gets no images" do
      expect(parser.images).to be_empty
    end

    it "gets no video" do
      expect(parser.video).to be_nil
    end
  end
end
@@ -0,0 +1,26 @@
1
+ require 'webmock/rspec'
2
+ require "simplecov"
3
+ require "json"
4
+ require "vcr"
5
+
6
# Coverage: files whose path matches "/spec/" are filtered out of the report.
SimpleCov.start { add_filter "/spec/" }

# VCR: cassettes live under spec/fixtures/vcr_cassettes and requests are
# intercepted through WebMock.
VCR.configure do |vcr|
  vcr.cassette_library_dir = 'spec/fixtures/vcr_cassettes'
  vcr.hook_into :webmock
end
14
+
15
+ $LOAD_PATH.unshift(File.expand_path("../..", __FILE__))
16
+
17
+ require 'lib/trawler'
18
+
19
# Resolves paths inside the "fixtures" directory that sits next to this file.
#
# @param filename [String, nil] path relative to the fixtures directory
# @return [String] the absolute fixtures directory when +filename+ is nil,
#   otherwise the absolute path of +filename+ inside it
def fixture_path(filename=nil)
  path = File.expand_path("../fixtures", __FILE__)
  filename.nil? ? path : File.join(path, filename)
end

# Reads a fixture file and returns its contents.
#
# Delegates path building to fixture_path so both helpers resolve fixtures
# the same way.
#
# @param file [String] path relative to the fixtures directory
# @return [String] the contents of the file
def fixture(file)
  File.read(fixture_path(file))
end
@@ -0,0 +1,52 @@
1
+ require "spec_helper"
2
+
3
# Specs for Trawler::Spider. Examples that fetch the page run inside the
# "trawl_page" VCR cassette.
describe Trawler::Spider do
  let(:spider) { Trawler::Spider.new("www.cats.com") }

  it "assigns the url" do
    # Reads the instance variable directly.
    # NOTE(review): use a public reader instead if Spider exposes one — not
    # visible from this spec.
    expect(spider.instance_variable_get("@url")).to eq "www.cats.com"
  end

  describe "#full_url" do
    context "without a full scheme" do
      it "adds the protocol to the url" do
        expect(spider.full_url).to eq "http://www.cats.com"
      end
    end

    context "with a full scheme" do
      let(:spider) { Trawler::Spider.new("https://foo.com") }

      it "returns the url" do
        expect(spider.full_url).to eq "https://foo.com"
      end
    end
  end

  describe "#get_page" do
    # The title states what is asserted: a StringIO, not a String.
    it "returns a StringIO" do
      VCR.use_cassette("trawl_page") do
        expect(spider.get_page).to be_a StringIO
      end
    end
  end

  describe "#call" do
    # Result of calling the spider while the cassette is active.
    let(:crawled_spider) do
      VCR.use_cassette("trawl_page") do
        spider.call
      end
    end

    it "returns the object" do
      expect(crawled_spider).to be_a Trawler::Spider
    end

    it "has the full_url" do
      expect(crawled_spider.full_url).to eq "http://www.cats.com"
    end

    it "has the page" do
      expect(crawled_spider.page).not_to be_nil
    end
  end
end
@@ -0,0 +1,30 @@
1
+ require "spec_helper"
2
+
3
# Specs for the top-level Trawler.fetch entry point, with the network
# replaced by a saved sample page.
describe Trawler do
  # ".fetch" rather than "#fetch": the method is called on Trawler itself.
  describe ".fetch" do

    before do
      # Every Spider returns the saved tumblr page instead of fetching.
      # NOTE(review): this stub yields a String, while the Spider spec asserts
      # that the real #get_page returns a StringIO — confirm both are accepted
      # downstream.
      Trawler::Spider.any_instance.stub(:get_page) { fixture("sample_pages/tumblr.html") }
    end

    let(:haul) { Trawler.fetch("http://www.dogshaming.com/") }

    it "returns a ParsedDocument" do
      expect(haul).to be_a(Trawler::ParsedDocument)
    end

    it "has the original url" do
      expect(haul.url).to eq("http://www.dogshaming.com/")
    end

    # One example per reader the result is expected to respond to.
    [:video, :images, :description, :title].each do |attr|
      it "has #{attr}" do
        expect(haul).to respond_to(attr)
      end
    end

    it "returns an array of images" do
      expect(haul.images).to be_a Array
    end
  end
end
data/trawler.gemspec ADDED
@@ -0,0 +1,30 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'trawler/version'
5
+
6
# Gem specification for trawler: identity, packaged files and dependencies.
Gem::Specification.new do |gem|
  # Identity
  gem.name        = "trawler"
  gem.version     = Trawler::VERSION
  gem.authors     = ["Jon Wheeler"]
  gem.email       = ["jon@doejo.com"]
  gem.description = "Scrapes the web"
  gem.summary     = "Scrape a url for images, video links and meta descriptions"
  gem.homepage    = "https://github.com/Jonwheeler/trawler"
  gem.license     = "MIT"

  # Packaged files: everything tracked by git; executables and test files are
  # picked out of that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Runtime dependencies
  gem.add_dependency "nokogiri", "~> 1.6"
  gem.add_dependency "hashr", "~> 0.0.22"

  # Development dependencies, declared in order; "rake" has no version
  # constraint.
  [
    ["bundler", "~> 1.3"],
    ["rake"],
    ["rspec", "~> 2.14"],
    ["webmock", "~> 1.14"],
    ["vcr", "~> 2.6"],
    ["simplecov", "~> 0.7"]
  ].each do |dependency|
    gem.add_development_dependency(*dependency)
  end
end
metadata ADDED
@@ -0,0 +1,189 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: trawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Jon Wheeler
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-10-18 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: hashr
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: 0.0.22
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: 0.0.22
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: '1.3'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: '1.3'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ~>
74
+ - !ruby/object:Gem::Version
75
+ version: '2.14'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ~>
81
+ - !ruby/object:Gem::Version
82
+ version: '2.14'
83
+ - !ruby/object:Gem::Dependency
84
+ name: webmock
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ~>
88
+ - !ruby/object:Gem::Version
89
+ version: '1.14'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ~>
95
+ - !ruby/object:Gem::Version
96
+ version: '1.14'
97
+ - !ruby/object:Gem::Dependency
98
+ name: vcr
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ~>
102
+ - !ruby/object:Gem::Version
103
+ version: '2.6'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ~>
109
+ - !ruby/object:Gem::Version
110
+ version: '2.6'
111
+ - !ruby/object:Gem::Dependency
112
+ name: simplecov
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ~>
116
+ - !ruby/object:Gem::Version
117
+ version: '0.7'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ~>
123
+ - !ruby/object:Gem::Version
124
+ version: '0.7'
125
+ description: Scrapes the web
126
+ email:
127
+ - jon@doejo.com
128
+ executables: []
129
+ extensions: []
130
+ extra_rdoc_files: []
131
+ files:
132
+ - .gitignore
133
+ - .rspec
134
+ - Gemfile
135
+ - LICENSE.txt
136
+ - README.md
137
+ - Rakefile
138
+ - lib/trawler.rb
139
+ - lib/trawler/document.rb
140
+ - lib/trawler/parsed_document.rb
141
+ - lib/trawler/parser.rb
142
+ - lib/trawler/spider.rb
143
+ - lib/trawler/version.rb
144
+ - spec/document_spec.rb
145
+ - spec/fixtures/sample_pages/simple.html
146
+ - spec/fixtures/sample_pages/tumblr.html
147
+ - spec/fixtures/sample_pages/youtube.html
148
+ - spec/fixtures/vcr_cassettes/trawl_page.yml
149
+ - spec/parsed_document_spec.rb
150
+ - spec/parser_spec.rb
151
+ - spec/spec_helper.rb
152
+ - spec/spider_spec.rb
153
+ - spec/trawler_spec.rb
154
+ - trawler.gemspec
155
+ homepage: https://github.com/Jonwheeler/trawler
156
+ licenses:
157
+ - MIT
158
+ metadata: {}
159
+ post_install_message:
160
+ rdoc_options: []
161
+ require_paths:
162
+ - lib
163
+ required_ruby_version: !ruby/object:Gem::Requirement
164
+ requirements:
165
+ - - '>='
166
+ - !ruby/object:Gem::Version
167
+ version: '0'
168
+ required_rubygems_version: !ruby/object:Gem::Requirement
169
+ requirements:
170
+ - - '>='
171
+ - !ruby/object:Gem::Version
172
+ version: '0'
173
+ requirements: []
174
+ rubyforge_project:
175
+ rubygems_version: 2.1.9
176
+ signing_key:
177
+ specification_version: 4
178
+ summary: Scrape a url for images, video links and meta descriptions
179
+ test_files:
180
+ - spec/document_spec.rb
181
+ - spec/fixtures/sample_pages/simple.html
182
+ - spec/fixtures/sample_pages/tumblr.html
183
+ - spec/fixtures/sample_pages/youtube.html
184
+ - spec/fixtures/vcr_cassettes/trawl_page.yml
185
+ - spec/parsed_document_spec.rb
186
+ - spec/parser_spec.rb
187
+ - spec/spec_helper.rb
188
+ - spec/spider_spec.rb
189
+ - spec/trawler_spec.rb