trawler 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/.rspec +1 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +52 -0
- data/Rakefile +1 -0
- data/lib/trawler/document.rb +22 -0
- data/lib/trawler/parsed_document.rb +30 -0
- data/lib/trawler/parser.rb +94 -0
- data/lib/trawler/spider.rb +24 -0
- data/lib/trawler/version.rb +3 -0
- data/lib/trawler.rb +11 -0
- data/spec/document_spec.rb +31 -0
- data/spec/fixtures/sample_pages/simple.html +10 -0
- data/spec/fixtures/sample_pages/tumblr.html +840 -0
- data/spec/fixtures/sample_pages/youtube.html +1404 -0
- data/spec/fixtures/vcr_cassettes/trawl_page.yml +57 -0
- data/spec/parsed_document_spec.rb +32 -0
- data/spec/parser_spec.rb +100 -0
- data/spec/spec_helper.rb +26 -0
- data/spec/spider_spec.rb +52 -0
- data/spec/trawler_spec.rb +30 -0
- data/trawler.gemspec +30 -0
- metadata +189 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
---
|
|
2
|
+
http_interactions:
|
|
3
|
+
- request:
|
|
4
|
+
method: get
|
|
5
|
+
uri: http://www.cats.com/
|
|
6
|
+
body:
|
|
7
|
+
encoding: US-ASCII
|
|
8
|
+
string: ''
|
|
9
|
+
headers:
|
|
10
|
+
Accept-Encoding:
|
|
11
|
+
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
|
|
12
|
+
Accept:
|
|
13
|
+
- '*/*'
|
|
14
|
+
User-Agent:
|
|
15
|
+
- Ruby
|
|
16
|
+
response:
|
|
17
|
+
status:
|
|
18
|
+
code: 200
|
|
19
|
+
message: OK
|
|
20
|
+
headers:
|
|
21
|
+
Server:
|
|
22
|
+
- Apache
|
|
23
|
+
Set-Cookie:
|
|
24
|
+
- COOKIE=10.5.19.71.1382123959465997; path=/
|
|
25
|
+
- referrer=; path=/
|
|
26
|
+
- referrer=www.cats.com; path=/
|
|
27
|
+
- t=2fe697b0382a11e3a9020015c5e70714; path=/
|
|
28
|
+
Etag:
|
|
29
|
+
- '"AAAAUHDDtBw"'
|
|
30
|
+
Last-Modified:
|
|
31
|
+
- Wed, 16 Oct 2013 20:57:10 GMT
|
|
32
|
+
Vary:
|
|
33
|
+
- Accept-Encoding,User-Agent
|
|
34
|
+
Cartoon:
|
|
35
|
+
- aalander3
|
|
36
|
+
Content-Type:
|
|
37
|
+
- text/html; charset=UTF-8
|
|
38
|
+
Content-Length:
|
|
39
|
+
- '39'
|
|
40
|
+
Accept-Ranges:
|
|
41
|
+
- bytes
|
|
42
|
+
Date:
|
|
43
|
+
- Fri, 18 Oct 2013 19:19:19 GMT
|
|
44
|
+
X-Varnish:
|
|
45
|
+
- '2126042946'
|
|
46
|
+
Age:
|
|
47
|
+
- '0'
|
|
48
|
+
Via:
|
|
49
|
+
- 1.1 varnish
|
|
50
|
+
Connection:
|
|
51
|
+
- keep-alive
|
|
52
|
+
body:
|
|
53
|
+
encoding: UTF-8
|
|
54
|
+
string: "<html>\r\n<body>\r\n</body>\r\n</html>\r\n"
|
|
55
|
+
http_version:
|
|
56
|
+
recorded_at: Fri, 18 Oct 2013 19:19:14 GMT
|
|
57
|
+
recorded_with: VCR 2.6.0
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
require "spec_helper"
|
|
2
|
+
|
|
3
|
+
# Checks that a ParsedDocument built from a URL and a parser-like object
# exposes the URL and the parser's values through its readers.
describe Trawler::ParsedDocument do
  # Test double standing in for the parser; it answers each message the
  # examples below depend on.
  let(:data) do
    double(
      "PARSER",
      images: ["foo"],
      title: "foobar",
      video: "bar",
      description: "foobarbaz",
      document: ""
    )
  end

  let(:doc) { described_class.new("www.cats.com", data) }

  it "sets the url" do
    expect(doc.url).to eq("www.cats.com")
  end

  it "has images" do
    found = doc.images
    expect(found).to be_an(Array)
    expect(found).to include("foo")
  end

  it "has a title" do
    expect(doc.title).to eq("foobar")
  end

  it "has a video" do
    expect(doc.video).to eq("bar")
  end

  it "has a description" do
    expect(doc.description).to eq("foobarbaz")
  end

  # The double's `document` value is an empty string, which is what the
  # example expects back from `raw_data`.
  it "has the raw data" do
    expect(doc.raw_data).to eq("")
  end
end
|
data/spec/parser_spec.rb
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
require "spec_helper"
|
|
2
|
+
|
|
3
|
+
# Exercises Trawler::Parser against three saved HTML fixtures: an image-heavy
# page, a video page and a plain document.
describe Trawler::Parser do
  # The parser is built identically in every context; each context supplies
  # its own `page` and may override `url`.
  let(:url) { "http://www.foo.com" }
  let(:parser) do
    Trawler::Parser.new(
      page: page,
      url: url,
      image_size: "100"
    )
  end

  context "image rich page" do
    let(:page) { fixture("sample_pages/tumblr.html") }

    describe "#title" do
      it "parses the page title" do
        expect(parser.title).to eq "Dogshaming"
      end
    end

    describe "#description" do
      it "parses the description from the meta data" do
        expect(parser.description).not_to be_empty
      end
    end

    # Named after the method actually under test (`video`, singular).
    describe "#video" do
      it "returns nil if no videos are found" do
        expect(parser.video).to be_nil
      end
    end

    describe "#images" do
      it "returns an array of images" do
        expect(parser.images).to be_a Array
      end

      it "returns all the images that are larger than the min image size" do
        expect(parser.images.size).to eq 10
      end
    end
  end

  context "video page" do
    let(:page) { fixture("sample_pages/youtube.html") }

    it "gets the title" do
      expect(parser.title).to eq "Single Page Web Applications: JavaScript End-to-End (The Hard Stuff)"
    end

    it "gets the video" do
      expect(parser.video).to eq "http://www.youtube.com/v/OrIFaWJ9Glo?version=3&autohide=1"
    end

    it "gets the description" do
      expect(parser.description).to eq "In the old days, when websites were steam powered and exploded regularly, the web was simple, but slow. As it evolved it became more powerful, but harder on ..."
    end

    it "gets the images" do
      expect(parser.images).to include "http://i1.ytimg.com/vi/OrIFaWJ9Glo/maxresdefault.jpg"
    end
  end

  context "document style page" do
    let(:page) { fixture("sample_pages/simple.html") }
    let(:url) { "foobar" }

    let(:description) do
      "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed dapibus velit in lacus mollis vehicula nec a arcu"
    end

    it "returns the description from the body of the text" do
      # NOTE(review): `match` with a String presumably treats it as a regular
      # expression pattern, so the dots act as wildcards -- confirm whether a
      # literal comparison was intended.
      expect(parser.description[0..120]).to match description
    end

    it "gets the title from the html" do
      expect(parser.title).to eq "Super simple html page"
    end

    it "gets no images" do
      expect(parser.images).to be_empty
    end

    it "gets no video" do
      expect(parser.video).to be_nil
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
require 'webmock/rspec'
|
|
2
|
+
require "simplecov"
|
|
3
|
+
require "json"
|
|
4
|
+
require "vcr"
|
|
5
|
+
|
|
6
|
+
# SimpleCov is started before the library is required below; files under
# /spec/ are filtered out of the report.
SimpleCov.start { add_filter "/spec/" }

# VCR hooks into WebMock and keeps its cassettes with the other fixtures.
VCR.configure do |config|
  config.hook_into :webmock
  config.cassette_library_dir = 'spec/fixtures/vcr_cassettes'
end

# Put the directory above spec/ on the load path so the library can be
# required by its path relative to that directory.
project_root = File.expand_path("../..", __FILE__)
$LOAD_PATH.unshift(project_root)

require 'lib/trawler'
|
|
18
|
+
|
|
19
|
+
# Resolves a path inside the `fixtures` directory that sits next to this file.
#
# @param filename [String, nil] path relative to the fixtures directory
# @return [String] the fixtures directory itself when +filename+ is nil,
#   otherwise that directory joined with +filename+
def fixture_path(filename = nil)
  fixtures_dir = File.expand_path("../fixtures", __FILE__)
  return fixtures_dir if filename.nil?

  File.join(fixtures_dir, filename)
end
|
|
23
|
+
|
|
24
|
+
# Reads a fixture file and returns its contents.
#
# @param file [String] path relative to the fixtures directory
# @return [String] the file's contents
def fixture(file)
  full_path = File.join(fixture_path, file)
  File.read(full_path)
end
|
data/spec/spider_spec.rb
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
require "spec_helper"
|
|
2
|
+
|
|
3
|
+
# Covers Trawler::Spider: URL normalisation, fetching a page and the `call`
# entry point. HTTP traffic is replayed from the "trawl_page" VCR cassette.
describe Trawler::Spider do
  let(:spider) { Trawler::Spider.new("www.cats.com") }

  it "assigns the url" do
    # The raw url is inspected through its instance variable.
    expect(spider.instance_variable_get("@url")).to eq "www.cats.com"
  end

  describe "#full_url" do
    context "without a full scheme" do
      it "adds the protocol to the url" do
        expect(spider.full_url).to eq "http://www.cats.com"
      end
    end

    context "with a full scheme" do
      let(:spider) { Trawler::Spider.new("https://foo.com") }

      it "returns the url" do
        expect(spider.full_url).to eq "https://foo.com"
      end
    end
  end

  describe "#get_page" do
    # The example name now states the type the assertion actually checks.
    it "returns a StringIO" do
      VCR.use_cassette("trawl_page") do
        expect(spider.get_page).to be_a StringIO
      end
    end
  end

  describe "#call" do
    # Run the spider once inside the cassette; the examples below inspect
    # whatever `call` returns.
    let(:crawled_spider) do
      VCR.use_cassette("trawl_page") do
        spider.call
      end
    end

    it "returns the object" do
      expect(crawled_spider).to be_a Trawler::Spider
    end

    it "has the full_url" do
      expect(crawled_spider.full_url).to eq "http://www.cats.com"
    end

    it "has the page" do
      expect(crawled_spider.page).not_to be_nil
    end
  end
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require "spec_helper"
|
|
2
|
+
|
|
3
|
+
# End-to-end check of Trawler.fetch with the page download stubbed out.
describe Trawler do
  describe "#fetch" do

    let(:haul) { Trawler.fetch("http://www.dogshaming.com/") }

    # Every Spider hands back the saved tumblr fixture instead of running its
    # real `get_page`.
    before do
      Trawler::Spider.any_instance.stub(:get_page) { fixture("sample_pages/tumblr.html") }
    end

    it "returns a ParsedDocument" do
      expect(haul).to be_a(Trawler::ParsedDocument)
    end

    it "has the original url" do
      expect(haul.url).to eq("http://www.dogshaming.com/")
    end

    # One example per reader the result must respond to.
    [:video, :images, :description, :title].each do |reader|
      it("has #{reader}") { expect(haul).to respond_to(reader) }
    end

    it "returns an array of images" do
      expect(haul.images).to be_an(Array)
    end
  end
end
|
data/trawler.gemspec
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# coding: utf-8
|
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
|
+
require 'trawler/version'
|
|
5
|
+
|
|
6
|
+
# Gem specification for trawler: identity, packaged files and dependencies.
Gem::Specification.new do |gem|
  # Identity and descriptive metadata.
  gem.name        = "trawler"
  gem.version     = Trawler::VERSION
  gem.authors     = ["Jon Wheeler"]
  gem.email       = ["jon@doejo.com"]
  gem.description = "Scrapes the web"
  gem.summary     = "Scrape a url for images, video links and meta descriptions"
  gem.homepage    = "https://github.com/Jonwheeler/trawler"
  gem.license     = "MIT"

  # Package every file git tracks; executables and test files are picked out
  # of that list by their leading directory.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Runtime dependencies.
  gem.add_dependency("nokogiri", "~> 1.6")
  gem.add_dependency("hashr", "~> 0.0.22")

  # Development and test tooling.
  gem.add_development_dependency("bundler", "~> 1.3")
  gem.add_development_dependency("rake")
  gem.add_development_dependency("rspec", "~> 2.14")
  gem.add_development_dependency("webmock", "~> 1.14")
  gem.add_development_dependency("vcr", "~> 2.6")
  gem.add_development_dependency("simplecov", "~> 0.7")
end
|
metadata
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: trawler
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.0.1
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Jon Wheeler
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2013-10-18 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: nokogiri
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - ~>
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '1.6'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - ~>
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '1.6'
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: hashr
|
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
|
30
|
+
requirements:
|
|
31
|
+
- - ~>
|
|
32
|
+
- !ruby/object:Gem::Version
|
|
33
|
+
version: 0.0.22
|
|
34
|
+
type: :runtime
|
|
35
|
+
prerelease: false
|
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
+
requirements:
|
|
38
|
+
- - ~>
|
|
39
|
+
- !ruby/object:Gem::Version
|
|
40
|
+
version: 0.0.22
|
|
41
|
+
- !ruby/object:Gem::Dependency
|
|
42
|
+
name: bundler
|
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
|
44
|
+
requirements:
|
|
45
|
+
- - ~>
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: '1.3'
|
|
48
|
+
type: :development
|
|
49
|
+
prerelease: false
|
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
+
requirements:
|
|
52
|
+
- - ~>
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: '1.3'
|
|
55
|
+
- !ruby/object:Gem::Dependency
|
|
56
|
+
name: rake
|
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
|
58
|
+
requirements:
|
|
59
|
+
- - '>='
|
|
60
|
+
- !ruby/object:Gem::Version
|
|
61
|
+
version: '0'
|
|
62
|
+
type: :development
|
|
63
|
+
prerelease: false
|
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
65
|
+
requirements:
|
|
66
|
+
- - '>='
|
|
67
|
+
- !ruby/object:Gem::Version
|
|
68
|
+
version: '0'
|
|
69
|
+
- !ruby/object:Gem::Dependency
|
|
70
|
+
name: rspec
|
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
|
72
|
+
requirements:
|
|
73
|
+
- - ~>
|
|
74
|
+
- !ruby/object:Gem::Version
|
|
75
|
+
version: '2.14'
|
|
76
|
+
type: :development
|
|
77
|
+
prerelease: false
|
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
79
|
+
requirements:
|
|
80
|
+
- - ~>
|
|
81
|
+
- !ruby/object:Gem::Version
|
|
82
|
+
version: '2.14'
|
|
83
|
+
- !ruby/object:Gem::Dependency
|
|
84
|
+
name: webmock
|
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
|
86
|
+
requirements:
|
|
87
|
+
- - ~>
|
|
88
|
+
- !ruby/object:Gem::Version
|
|
89
|
+
version: '1.14'
|
|
90
|
+
type: :development
|
|
91
|
+
prerelease: false
|
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
93
|
+
requirements:
|
|
94
|
+
- - ~>
|
|
95
|
+
- !ruby/object:Gem::Version
|
|
96
|
+
version: '1.14'
|
|
97
|
+
- !ruby/object:Gem::Dependency
|
|
98
|
+
name: vcr
|
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
|
100
|
+
requirements:
|
|
101
|
+
- - ~>
|
|
102
|
+
- !ruby/object:Gem::Version
|
|
103
|
+
version: '2.6'
|
|
104
|
+
type: :development
|
|
105
|
+
prerelease: false
|
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
107
|
+
requirements:
|
|
108
|
+
- - ~>
|
|
109
|
+
- !ruby/object:Gem::Version
|
|
110
|
+
version: '2.6'
|
|
111
|
+
- !ruby/object:Gem::Dependency
|
|
112
|
+
name: simplecov
|
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
|
114
|
+
requirements:
|
|
115
|
+
- - ~>
|
|
116
|
+
- !ruby/object:Gem::Version
|
|
117
|
+
version: '0.7'
|
|
118
|
+
type: :development
|
|
119
|
+
prerelease: false
|
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
121
|
+
requirements:
|
|
122
|
+
- - ~>
|
|
123
|
+
- !ruby/object:Gem::Version
|
|
124
|
+
version: '0.7'
|
|
125
|
+
description: Scrapes the web
|
|
126
|
+
email:
|
|
127
|
+
- jon@doejo.com
|
|
128
|
+
executables: []
|
|
129
|
+
extensions: []
|
|
130
|
+
extra_rdoc_files: []
|
|
131
|
+
files:
|
|
132
|
+
- .gitignore
|
|
133
|
+
- .rspec
|
|
134
|
+
- Gemfile
|
|
135
|
+
- LICENSE.txt
|
|
136
|
+
- README.md
|
|
137
|
+
- Rakefile
|
|
138
|
+
- lib/trawler.rb
|
|
139
|
+
- lib/trawler/document.rb
|
|
140
|
+
- lib/trawler/parsed_document.rb
|
|
141
|
+
- lib/trawler/parser.rb
|
|
142
|
+
- lib/trawler/spider.rb
|
|
143
|
+
- lib/trawler/version.rb
|
|
144
|
+
- spec/document_spec.rb
|
|
145
|
+
- spec/fixtures/sample_pages/simple.html
|
|
146
|
+
- spec/fixtures/sample_pages/tumblr.html
|
|
147
|
+
- spec/fixtures/sample_pages/youtube.html
|
|
148
|
+
- spec/fixtures/vcr_cassettes/trawl_page.yml
|
|
149
|
+
- spec/parsed_document_spec.rb
|
|
150
|
+
- spec/parser_spec.rb
|
|
151
|
+
- spec/spec_helper.rb
|
|
152
|
+
- spec/spider_spec.rb
|
|
153
|
+
- spec/trawler_spec.rb
|
|
154
|
+
- trawler.gemspec
|
|
155
|
+
homepage: https://github.com/Jonwheeler/trawler
|
|
156
|
+
licenses:
|
|
157
|
+
- MIT
|
|
158
|
+
metadata: {}
|
|
159
|
+
post_install_message:
|
|
160
|
+
rdoc_options: []
|
|
161
|
+
require_paths:
|
|
162
|
+
- lib
|
|
163
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
164
|
+
requirements:
|
|
165
|
+
- - '>='
|
|
166
|
+
- !ruby/object:Gem::Version
|
|
167
|
+
version: '0'
|
|
168
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
169
|
+
requirements:
|
|
170
|
+
- - '>='
|
|
171
|
+
- !ruby/object:Gem::Version
|
|
172
|
+
version: '0'
|
|
173
|
+
requirements: []
|
|
174
|
+
rubyforge_project:
|
|
175
|
+
rubygems_version: 2.1.9
|
|
176
|
+
signing_key:
|
|
177
|
+
specification_version: 4
|
|
178
|
+
summary: Scrape a url for images, video links and meta descriptions
|
|
179
|
+
test_files:
|
|
180
|
+
- spec/document_spec.rb
|
|
181
|
+
- spec/fixtures/sample_pages/simple.html
|
|
182
|
+
- spec/fixtures/sample_pages/tumblr.html
|
|
183
|
+
- spec/fixtures/sample_pages/youtube.html
|
|
184
|
+
- spec/fixtures/vcr_cassettes/trawl_page.yml
|
|
185
|
+
- spec/parsed_document_spec.rb
|
|
186
|
+
- spec/parser_spec.rb
|
|
187
|
+
- spec/spec_helper.rb
|
|
188
|
+
- spec/spider_spec.rb
|
|
189
|
+
- spec/trawler_spec.rb
|