sagrone_scraper 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +47 -5
- data/lib/sagrone_scraper.rb +34 -0
- data/lib/sagrone_scraper/parser.rb +9 -1
- data/lib/sagrone_scraper/version.rb +1 -1
- data/spec/sagrone_scraper/agent_spec.rb +6 -4
- data/spec/sagrone_scraper/parser_spec.rb +21 -10
- data/spec/sagrone_scraper_spec.rb +73 -0
- data/spec/spec_helper.rb +2 -0
- data/spec/stub_helper.rb +1 -1
- data/spec/support/test_parsers/twitter_parser.rb +17 -0
- data/spec/{test_responses → support/test_responses}/twitter.com:Milano_JS +0 -0
- data/spec/{test_responses → support/test_responses}/www.example.com +0 -0
- metadata +8 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 80b3c30080aba0c8b1da8cfdcdefbb8e6ef527e1
+  data.tar.gz: c9992757e44377ed3081089f0348cdf1535e8a8e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 97477b7732ec3485aa7ba5ef2c7cb16ac130d6b1a1f6ee8b57deb5cf53fb6ae50bafdec13d1c575dbc382a748583a3e70b0da55660bd7065daaebc1479d466c4
+  data.tar.gz: fc8762b63b3429dcbd004bbdc39c6ef5bd7351e8b483ec8185c79b985ab5b2091601005a533270fc511e16d329f31a20616e1ea068f2b7ce7c3a0b3928087c5f
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,10 @@
 ### HEAD
 
+### 0.0.3
+
+- add `SagroneScraper::Parser.can_parse?(url)` class method, which must be implemented in subclasses
+- add `SagroneScraper` logic to _scrape_ a URL based on a set of _registered parsers_
+
 ### 0.0.2
 
 - add `SagroneScraper::Parser`
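To make the new 0.0.3 contract concrete: a `SagroneScraper::Parser` subclass that does not override `.can_parse?(url)` inherits a raising default (see the `data/lib/sagrone_scraper/parser.rb` hunk below). A minimal sketch, assuming the gem is installed; `IncompleteParser` is hypothetical and not part of the package:

```ruby
require 'sagrone_scraper'

# Hypothetical subclass that forgets to implement the new class method.
class IncompleteParser < SagroneScraper::Parser
end

IncompleteParser.can_parse?('http://example.com')
# raises NotImplementedError:
#   "Expected IncompleteParser.can_parse?(url) to be implemented."
```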
data/README.md
CHANGED
@@ -3,7 +3,7 @@
 [](http://badge.fury.io/rb/sagrone_scraper)
 [](https://travis-ci.org/Sagrone/scraper)
 
-Simple library to scrap web pages. Bellow you will find information on [how to use it](#usage).
+Simple library to scrap web pages. Bellow you will find information on [how to use it](#basic-usage).
 
 ## Table of Contents
 
@@ -12,6 +12,7 @@ Simple library to scrap web pages. Bellow you will find information on [how to u
 - [Modules](#modules)
   + [`SagroneScraper::Agent`](#sagronescraperagent)
   + [`SagroneScraper::Parser`](#sagronescraperparser)
+  + [`SagroneScraper.scrape`](#sagronescraperscrape)
 
 ## Installation
 
@@ -40,7 +41,7 @@ The agent is responsible for scraping a web page from a URL. Here is how you can
 1. one way is to pass it a `url` option
 
 ```ruby
-require 'sagrone_scraper
+require 'sagrone_scraper'
 
 agent = SagroneScraper::Agent.new(url: 'https://twitter.com/Milano_JS')
 agent.page
@@ -53,7 +54,7 @@ The agent is responsible for scraping a web page from a URL. Here is how you can
 2. another way is to pass a `page` option (`Mechanize::Page`)
 
 ```ruby
-require 'sagrone_scraper
+require 'sagrone_scraper'
 
 mechanize_agent = Mechanize.new { |agent| agent.user_agent_alias = 'Linux Firefox' }
 page = mechanize_agent.get('https://twitter.com/Milano_JS')
@@ -74,11 +75,16 @@ The _parser_ is responsible for extracting structured data from a _page_. The pa
 Example usage:
 
 ```ruby
-require 'sagrone_scraper
-require 'sagrone_scraper/parser'
+require 'sagrone_scraper'
 
 # 1) First define a custom parser, for example twitter.
 class TwitterParser < SagroneScraper::Parser
+  TWITTER_PROFILE_URL = /^https?:\/\/twitter.com\/(\w)+\/?$/i
+
+  def self.can_parse?(url)
+    url.match(TWITTER_PROFILE_URL)
+  end
+
   def bio
     page.at('.ProfileHeaderCard-bio').text
   end
@@ -100,6 +106,42 @@ parser.attributes
 # => {bio: "Javascript User Group Milano #milanojs", location: "Milan, Italy"}
 ```
 
+#### `SagroneScraper.scrape`
+
+This is the simplest way to scrape a web page:
+
+```ruby
+require 'sagrone_scraper'
+
+# 1) First we define a custom parser, for example twitter.
+class TwitterParser < SagroneScraper::Parser
+  TWITTER_PROFILE_URL = /^https?:\/\/twitter.com\/(\w)+\/?$/i
+
+  def self.can_parse?(url)
+    url.match(TWITTER_PROFILE_URL)
+  end
+
+  def bio
+    page.at('.ProfileHeaderCard-bio').text
+  end
+
+  def location
+    page.at('.ProfileHeaderCard-locationText').text
+  end
+end
+
+# 2) We register the parser.
+SagroneScraper.register_parser('TwitterParser')
+
+# 3) We can query for registered parsers.
+SagroneScraper.registered_parsers
+# => ['TwitterParser']
+
+# 4) We can now scrape twitter profile URLs.
+SagroneScraper.scrape(url: 'https://twitter.com/Milano_JS')
+# => {bio: "Javascript User Group Milano #milanojs", location: "Milan, Italy"}
+```
+
 ## Contributing
 
 1. Fork it ( https://github.com/[my-github-username]/sagrone_scraper/fork )
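Worth noting about `TWITTER_PROFILE_URL` in the README examples above: the pattern anchors on a single path segment, so profile sub-pages are rejected by `can_parse?`. A quick check in plain Ruby, using the regex exactly as it appears in the package:

```ruby
TWITTER_PROFILE_URL = /^https?:\/\/twitter.com\/(\w)+\/?$/i

TWITTER_PROFILE_URL.match('https://twitter.com/Milano_JS')
# => #<MatchData "https://twitter.com/Milano_JS" 1:"S">  (profile URL: parseable)
TWITTER_PROFILE_URL.match('https://twitter.com/Milano_JS/media')
# => nil  (sub-page: no parser claims it, so SagroneScraper.scrape raises)
```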
data/lib/sagrone_scraper.rb
CHANGED
@@ -1,7 +1,41 @@
 require "sagrone_scraper/version"
+require "sagrone_scraper/agent"
+require "sagrone_scraper/parser"
 
 module SagroneScraper
+  Error = Class.new(RuntimeError)
+
   def self.version
     VERSION
   end
+
+  def self.registered_parsers
+    @registered_parsers ||= []
+  end
+
+  def self.register_parser(name)
+    return if registered_parsers.include?(name)
+
+    parser_class = Object.const_get(name)
+    raise Error.new("Expected parser to be a SagroneScraper::Parser.") unless parser_class.ancestors.include?(SagroneScraper::Parser)
+
+    registered_parsers.push(name)
+  end
+
+  def self.scrape(options)
+    url = options.fetch(:url) do
+      raise Error.new('Option "url" must be provided.')
+    end
+
+    parser_class = registered_parsers
+      .map { |parser_name| Object.const_get(parser_name) }
+      .find { |parser_class| parser_class.can_parse?(url) }
+
+    raise Error.new("No registed parser can parse URL #{url}") unless parser_class
+
+    agent = SagroneScraper::Agent.new(url: url)
+    parser = parser_class.new(page: agent.page)
+    parser.parse_page!
+    parser.attributes
+  end
 end
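Note the registry design in the hunk above: parsers are registered by class *name* and resolved lazily with `Object.const_get`, so registration is idempotent and misnamed or non-parser constants fail loudly. A sketch of the resulting behavior in an irb-style session; `MyParser` is hypothetical:

```ruby
require 'sagrone_scraper'

class MyParser < SagroneScraper::Parser
  def self.can_parse?(url)
    false
  end
end

SagroneScraper.register_parser('MyParser')  # resolves the constant, registers it
SagroneScraper.register_parser('MyParser')  # no-op: name already registered
SagroneScraper.registered_parsers
# => ["MyParser"]

SagroneScraper.register_parser('String')
# raises SagroneScraper::Error: "Expected parser to be a SagroneScraper::Parser."
SagroneScraper.register_parser('Nope')
# raises NameError: uninitialized constant Nope
```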
data/lib/sagrone_scraper/parser.rb
CHANGED
@@ -4,22 +4,30 @@ module SagroneScraper
   class Parser
     Error = Class.new(RuntimeError)
 
-    attr_reader :page, :attributes
+    attr_reader :page, :page_url, :attributes
 
     def initialize(options = {})
       @page = options.fetch(:page) do
         raise Error.new('Option "page" must be provided.')
       end
+      @page_url = @page.uri.to_s
       @attributes = {}
     end
 
     def parse_page!
+      return unless self.class.can_parse?(page_url)
+
       self.class.method_names.each do |name|
         attributes[name] = send(name)
       end
       nil
     end
 
+    def self.can_parse?(url)
+      class_with_method = "#{self}.can_parse?(url)"
+      raise NotImplementedError.new("Expected #{class_with_method} to be implemented.")
+    end
+
     private
 
     def self.method_names
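The `parse_page!` guard added above means a parser handed a page whose URL it cannot parse silently extracts nothing rather than raising. A minimal sketch, assuming `page` is a `Mechanize::Page` fetched elsewhere; `NullParser` is hypothetical:

```ruby
class NullParser < SagroneScraper::Parser
  def self.can_parse?(url)
    false  # claims no URL at all
  end
end

parser = NullParser.new(page: page)
parser.parse_page!  # returns nil early: can_parse?(page_url) is false
parser.attributes   # => {} (no extraction methods were called)
```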
data/spec/sagrone_scraper/agent_spec.rb
CHANGED
@@ -27,8 +27,10 @@ RSpec.describe SagroneScraper::Agent do
     end
 
     it 'when options is empty' do
-      expect {
-
+      expect {
+        described_class.new
+      }.to raise_error(SagroneScraper::Agent::Error,
+                       'Exactly one option must be provided: "url" or "page"')
     end
 
     it 'when both options are present' do
@@ -37,7 +39,7 @@ RSpec.describe SagroneScraper::Agent do
       expect {
         described_class.new(url: 'http://example.com', page: page)
       }.to raise_error(SagroneScraper::Agent::Error,
-
+                       'Exactly one option must be provided: "url" or "page"')
     end
   end
 
@@ -61,7 +63,7 @@ RSpec.describe SagroneScraper::Agent do
       @invalid_url = 'not-a-url'
 
       expect { agent }.to raise_error(SagroneScraper::Agent::Error,
-
+                                      'absolute URL needed (not not-a-url)')
     end
 
     it 'should require absolute path' do
data/spec/sagrone_scraper/parser_spec.rb
CHANGED
@@ -4,7 +4,9 @@ require 'sagrone_scraper/parser'
 RSpec.describe SagroneScraper::Parser do
   describe '#initialize' do
     it 'requires a "page" option' do
-      expect {
+      expect {
+        described_class.new
+      }.to raise_error(SagroneScraper::Parser::Error, 'Option "page" must be provided.')
     end
   end
 
@@ -16,8 +18,17 @@ RSpec.describe SagroneScraper::Parser do
     it { expect(parser.page).to be_a(Mechanize::Page) }
   end
 
+  describe '#page_url' do
+    it { expect(parser.page_url).to be }
+    it { expect(parser.page_url).to eq page.uri.to_s }
+  end
+
   describe '#parse_page!' do
-    it
+    it do
+      expect {
+        parser.parse_page!
+      }.to raise_error(NotImplementedError, "Expected #{described_class}.can_parse?(url) to be implemented.")
+    end
   end
 
   describe '#attributes' do
@@ -25,17 +36,17 @@ RSpec.describe SagroneScraper::Parser do
     end
   end
 
-  describe '
-
-
-
-
-
-    def location
-      page.at('.ProfileHeaderCard-locationText').text
+  describe 'class methods' do
+    describe '.can_parse?(url)' do
+      it do
+        expect {
+          described_class.can_parse?('url')
+        }.to raise_error(NotImplementedError, "Expected #{described_class}.can_parse?(url) to be implemented.")
       end
     end
+  end
 
+  describe 'create custom TwitterParser from SagroneScraper::Parser' do
     before do
       stub_request_for('https://twitter.com/Milano_JS', 'twitter.com:Milano_JS')
     end
data/spec/sagrone_scraper_spec.rb
CHANGED
@@ -5,4 +5,77 @@ RSpec.describe SagroneScraper do
   describe '.version' do
     it { expect(SagroneScraper.version).to be_a(String) }
   end
+
+  context 'parsers registered' do
+    before do
+      described_class.registered_parsers.clear
+    end
+
+    describe '.registered_parsers' do
+      it { expect(described_class.registered_parsers).to be_empty }
+      it { expect(described_class.registered_parsers).to be_a(Array) }
+    end
+
+    describe '.register_parser(name)' do
+      TestParser = Class.new(SagroneScraper::Parser)
+      NotParser = Class.new
+
+      it 'should check parser name is an existing constant' do
+        expect {
+          described_class.register_parser('Unknown')
+        }.to raise_error(NameError, 'uninitialized constant Unknown')
+      end
+
+      it 'should check parser class inherits from SagroneScraper::Parser' do
+        expect {
+          described_class.register_parser('NotParser')
+        }.to raise_error(SagroneScraper::Error, 'Expected parser to be a SagroneScraper::Parser.')
+      end
+
+      it 'after adding a "parser" should have it registered' do
+        described_class.register_parser('TestParser')
+
+        expect(described_class.registered_parsers).to include('TestParser')
+        expect(described_class.registered_parsers.size).to eq 1
+      end
+
+      it 'adding same "parser" multiple times should register it once' do
+        described_class.register_parser('TestParser')
+        described_class.register_parser('TestParser')
+
+        expect(described_class.registered_parsers).to include('TestParser')
+        expect(described_class.registered_parsers.size).to eq 1
+      end
+    end
+  end
+
+  describe '.scrape' do
+    before do
+      SagroneScraper.registered_parsers.clear
+      SagroneScraper.register_parser('TwitterParser')
+
+      stub_request_for('https://twitter.com/Milano_JS', 'twitter.com:Milano_JS')
+    end
+
+    it 'should `url` option' do
+      expect {
+        described_class.scrape({})
+      }.to raise_error(SagroneScraper::Error, 'Option "url" must be provided.')
+    end
+
+    it 'should scrape URL if registered parser knows how to parse it' do
+      expected_attributes = {
+        bio: "Javascript User Group Milano #milanojs",
+        location: "Milan, Italy"
+      }
+
+      expect(described_class.scrape(url: 'https://twitter.com/Milano_JS')).to eq(expected_attributes)
+    end
+
+    it 'should return raise error if no registered paser can parse the URL' do
+      expect {
+        described_class.scrape(url: 'https://twitter.com/Milano_JS/media')
+      }.to raise_error(SagroneScraper::Error, "No registed parser can parse URL https://twitter.com/Milano_JS/media")
+    end
+  end
 end
data/spec/spec_helper.rb
CHANGED
data/spec/stub_helper.rb
CHANGED
data/spec/support/test_parsers/twitter_parser.rb
ADDED
@@ -0,0 +1,17 @@
+require 'sagrone_scraper/parser'
+
+class TwitterParser < SagroneScraper::Parser
+  TWITTER_PROFILE_URL = /^https?:\/\/twitter.com\/(\w)+\/?$/i
+
+  def self.can_parse?(url)
+    url.match(TWITTER_PROFILE_URL)
+  end
+
+  def bio
+    page.at('.ProfileHeaderCard-bio').text
+  end
+
+  def location
+    page.at('.ProfileHeaderCard-locationText').text
+  end
+end
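This fixture parser is what `spec/sagrone_scraper_spec.rb` registers to exercise `SagroneScraper.scrape` without hitting the network. Roughly how the pieces fit together (a sketch: `stub_request_for` comes from `spec/stub_helper.rb`, and the require path assumes `spec/` is on the load path):

```ruby
require 'support/test_parsers/twitter_parser'

SagroneScraper.register_parser('TwitterParser')
stub_request_for('https://twitter.com/Milano_JS', 'twitter.com:Milano_JS')

SagroneScraper.scrape(url: 'https://twitter.com/Milano_JS')
# => {bio: "Javascript User Group Milano #milanojs", location: "Milan, Italy"}
```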
data/spec/{test_responses → support/test_responses}/twitter.com:Milano_JS
RENAMED
File without changes
data/spec/{test_responses → support/test_responses}/www.example.com
RENAMED
File without changes
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: sagrone_scraper
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 platform: ruby
 authors:
 - Marius Colacioiu
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-03-
+date: 2015-03-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mechanize
@@ -121,8 +121,9 @@ files:
 - spec/sagrone_scraper_spec.rb
 - spec/spec_helper.rb
 - spec/stub_helper.rb
-- spec/
-- spec/test_responses/
+- spec/support/test_parsers/twitter_parser.rb
+- spec/support/test_responses/twitter.com:Milano_JS
+- spec/support/test_responses/www.example.com
 homepage: ''
 licenses:
 - Apache License 2.0
@@ -153,5 +154,6 @@ test_files:
 - spec/sagrone_scraper_spec.rb
 - spec/spec_helper.rb
 - spec/stub_helper.rb
-- spec/
-- spec/test_responses/
+- spec/support/test_parsers/twitter_parser.rb
+- spec/support/test_responses/twitter.com:Milano_JS
+- spec/support/test_responses/www.example.com