sagrone_scraper 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +50 -5
- data/lib/sagrone_scraper/parser.rb +34 -0
- data/lib/sagrone_scraper/version.rb +1 -1
- data/spec/sagrone_scraper/agent_spec.rb +1 -1
- data/spec/sagrone_scraper/parser_spec.rb +72 -0
- data/spec/sagrone_scraper_spec.rb +1 -1
- data/spec/test_responses/twitter.com:Milano_JS +10123 -0
- metadata +6 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 82fc9ba674d9d3398b5f596d513fbb8eeb8abe3b
|
4
|
+
data.tar.gz: a8acac43dc318b6dcad951d57d3e1ce59478c67d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0dc5041f027f685ac241fcf0e103d3d8a6fa225002cdedd65a2f072fafb43904d3514a7cabad9a83bb0d3f02a3f7241c2c8dbeffdb48f133439f308319462870
|
7
|
+
data.tar.gz: 15644f0da27c4cb3f2452ac11958261acc7d0bd5e3e0539a7d7817b47f5c5096ea3f0d107b2f456f34f6fe8273fb7742128cd72f5c142c8416a521262ffb8e09
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -1,7 +1,18 @@
|
|
1
|
-
#
|
1
|
+
# Sagrone scraper
|
2
|
+
|
3
|
+
[Gem Version](http://badge.fury.io/rb/sagrone_scraper)
|
4
|
+
[Build Status](https://travis-ci.org/Sagrone/scraper)
|
2
5
|
|
3
6
|
Simple library to scrape web pages. Below you will find information on [how to use it](#usage).
|
4
7
|
|
8
|
+
## Table of Contents
|
9
|
+
|
10
|
+
- [Installation](#installation)
|
11
|
+
- [Basic Usage](#basic-usage)
|
12
|
+
- [Modules](#modules)
|
13
|
+
+ [`SagroneScraper::Agent`](#sagronescraperagent)
|
14
|
+
+ [`SagroneScraper::Parser`](#sagronescraperparser)
|
15
|
+
|
5
16
|
## Installation
|
6
17
|
|
7
18
|
Add this line to your application's Gemfile:
|
@@ -16,13 +27,15 @@ Or install it yourself as:
|
|
16
27
|
|
17
28
|
$ gem install sagrone_scraper
|
18
29
|
|
19
|
-
## Usage
|
30
|
+
## Basic Usage
|
20
31
|
|
21
|
-
|
32
|
+
Coming soon...
|
22
33
|
|
23
|
-
|
34
|
+
## Modules
|
24
35
|
|
25
|
-
|
36
|
+
#### `SagroneScraper::Agent`
|
37
|
+
|
38
|
+
The agent is responsible for scraping a web page from a URL. Here is how you can create an `agent`:
|
26
39
|
|
27
40
|
1. one way is to pass it a `url` option
|
28
41
|
|
@@ -54,6 +67,38 @@ Here is how you can create an `agent`:
|
|
54
67
|
# => "Milan, Italy"
|
55
68
|
```
|
56
69
|
|
70
|
+
#### `SagroneScraper::Parser`
|
71
|
+
|
72
|
+
The _parser_ is responsible for extracting structured data from a _page_. The page can be obtained by the _agent_.
|
73
|
+
|
74
|
+
Example usage:
|
75
|
+
|
76
|
+
```ruby
|
77
|
+
require 'sagrone_scraper/agent'
|
78
|
+
require 'sagrone_scraper/parser'
|
79
|
+
|
80
|
+
# 1) First define a custom parser, for example twitter.
|
81
|
+
class TwitterParser < SagroneScraper::Parser
|
82
|
+
def bio
|
83
|
+
page.at('.ProfileHeaderCard-bio').text
|
84
|
+
end
|
85
|
+
|
86
|
+
def location
|
87
|
+
page.at('.ProfileHeaderCard-locationText').text
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
# 2) Create an agent scraper, which will give us the page to parse.
|
92
|
+
agent = SagroneScraper::Agent.new(url: 'https://twitter.com/Milano_JS')
|
93
|
+
|
94
|
+
# 3) Instantiate the parser.
|
95
|
+
parser = TwitterParser.new(page: agent.page)
|
96
|
+
|
97
|
+
# 4) Parse page and extract attributes.
|
98
|
+
parser.parse_page!
|
99
|
+
parser.attributes
|
100
|
+
# => {bio: "Javascript User Group Milano #milanojs", location: "Milan, Italy"}
|
101
|
+
```
|
57
102
|
|
58
103
|
## Contributing
|
59
104
|
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
|
3
|
+
module SagroneScraper
  # Base class for page parsers.
  #
  # Subclasses define public, zero-argument instance methods; each one
  # extracts a single value from +page+. Calling #parse_page! invokes every
  # such method and stores its result in #attributes under the method name.
  class Parser
    # Raised when the parser is constructed without the required options.
    Error = Class.new(RuntimeError)

    # page       - the object given as the :page option.
    # attributes - Hash of method name (Symbol) => extracted value; empty
    #              until #parse_page! has been called.
    attr_reader :page, :attributes

    # options - Hash of options:
    #           :page - the page to parse (required).
    #
    # Raises SagroneScraper::Parser::Error when :page is not given.
    def initialize(options = {})
      @page = options.fetch(:page) do
        raise Error, 'Option "page" must be provided.'
      end
      @attributes = {}
    end

    # Calls every registered extractor method of this parser's class and
    # stores the results in #attributes. Safe to call repeatedly: values are
    # overwritten per key, not accumulated.
    #
    # Returns nil.
    def parse_page!
      self.class.method_names.each do |name|
        attributes[name] = send(name)
      end
      nil
    end

    # NOTE: the class-level hooks below are deliberately defined AFTER the
    # instance methods above, so that +initialize+ and +parse_page!+ are never
    # registered as extractors on Parser itself. A bare `private` would not
    # affect `def self.` methods, and #parse_page! calls +method_names+ with an
    # explicit receiver, so these stay public.

    # Names of the extractor methods registered for this class. Stored in a
    # class-level instance variable, so each subclass has its own list.
    #
    # Returns an Array of Symbols.
    def self.method_names
      @method_names ||= []
    end

    # Ruby hook, called whenever an instance method is defined on this class
    # or a subclass. Registers the method as an extractor.
    #
    # Only public methods are registered: private helpers and an overridden
    # +initialize+ (always private) must not be invoked by #parse_page!.
    # A redefined method is not registered a second time.
    def self.method_added(name)
      super
      return unless public_method_defined?(name)
      return if method_names.include?(name)

      method_names.push(name)
    end
  end
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'sagrone_scraper/parser'
|
3
|
+
|
4
|
+
# Specs for SagroneScraper::Parser: construction requirements, the behaviour
# of a bare parser, and a custom subclass parsing a stubbed Twitter page.
RSpec.describe SagroneScraper::Parser do
  describe '#initialize' do
    it 'requires a "page" option' do
      expect { described_class.new }.to raise_error(SagroneScraper::Parser::Error, /Option "page" must be provided\./)
    end
  end

  describe 'instance methods' do
    let(:page) { Mechanize::Page.new }
    let(:parser) { described_class.new(page: page) }

    describe '#page' do
      it { expect(parser.page).to be_a(Mechanize::Page) }
    end

    describe '#parse_page!' do
      it { expect(parser.parse_page!).to eq nil }
    end

    describe '#attributes' do
      it { expect(parser.attributes).to be_empty }
    end
  end

  describe 'create custom TwitterParser from SagroneScraper::Parser' do
    # An anonymous subclass is used instead of `class TwitterParser` because a
    # class declared inside a describe block becomes a global constant and
    # leaks into every other spec file.
    let(:twitter_parser_class) do
      Class.new(described_class) do
        def bio
          page.at('.ProfileHeaderCard-bio').text
        end

        def location
          page.at('.ProfileHeaderCard-locationText').text
        end
      end
    end

    before do
      stub_request_for('https://twitter.com/Milano_JS', 'twitter.com:Milano_JS')
    end

    let(:page) { Mechanize.new.get('https://twitter.com/Milano_JS') }
    let(:twitter_parser) { twitter_parser_class.new(page: page) }
    let(:expected_attributes) do
      {
        bio: "Javascript User Group Milano #milanojs",
        location: "Milan, Italy"
      }
    end

    describe 'should be able to parse page without errors' do
      it { expect { twitter_parser.parse_page! }.to_not raise_error }
    end

    it 'should have attributes present after parsing' do
      twitter_parser.parse_page!

      expect(twitter_parser.attributes).to_not be_empty
      expect(twitter_parser.attributes).to eq expected_attributes
    end

    it 'should have correct attributes even if parsing is done multiple times' do
      3.times { twitter_parser.parse_page! }

      expect(twitter_parser.attributes).to_not be_empty
      expect(twitter_parser.attributes).to eq expected_attributes
    end
  end
end
|