sagrone_scraper 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +50 -5
- data/lib/sagrone_scraper/parser.rb +34 -0
- data/lib/sagrone_scraper/version.rb +1 -1
- data/spec/sagrone_scraper/agent_spec.rb +1 -1
- data/spec/sagrone_scraper/parser_spec.rb +72 -0
- data/spec/sagrone_scraper_spec.rb +1 -1
- data/spec/test_responses/twitter.com:Milano_JS +10123 -0
- metadata +6 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 82fc9ba674d9d3398b5f596d513fbb8eeb8abe3b
|
4
|
+
data.tar.gz: a8acac43dc318b6dcad951d57d3e1ce59478c67d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0dc5041f027f685ac241fcf0e103d3d8a6fa225002cdedd65a2f072fafb43904d3514a7cabad9a83bb0d3f02a3f7241c2c8dbeffdb48f133439f308319462870
|
7
|
+
data.tar.gz: 15644f0da27c4cb3f2452ac11958261acc7d0bd5e3e0539a7d7817b47f5c5096ea3f0d107b2f456f34f6fe8273fb7742128cd72f5c142c8416a521262ffb8e09
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -1,7 +1,18 @@
|
|
1
|
-
#
|
1
|
+
# Sagrone scraper
|
2
|
+
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/sagrone_scraper.svg)](http://badge.fury.io/rb/sagrone_scraper)
|
4
|
+
[![Build Status](https://travis-ci.org/Sagrone/scraper.svg?branch=master)](https://travis-ci.org/Sagrone/scraper)
|
2
5
|
|
3
6
|
Simple library to scrape web pages. Below you will find information on [how to use it](#usage).
|
4
7
|
|
8
|
+
## Table of Contents
|
9
|
+
|
10
|
+
- [Installation](#installation)
|
11
|
+
- [Basic Usage](#basic-usage)
|
12
|
+
- [Modules](#modules)
|
13
|
+
+ [`SagroneScraper::Agent`](#sagronescraperagent)
|
14
|
+
+ [`SagroneScraper::Parser`](#sagronescraperparser)
|
15
|
+
|
5
16
|
## Installation
|
6
17
|
|
7
18
|
Add this line to your application's Gemfile:
|
@@ -16,13 +27,15 @@ Or install it yourself as:
|
|
16
27
|
|
17
28
|
$ gem install sagrone_scraper
|
18
29
|
|
19
|
-
## Usage
|
30
|
+
## Basic Usage
|
20
31
|
|
21
|
-
|
32
|
+
Coming soon...
|
22
33
|
|
23
|
-
|
34
|
+
## Modules
|
24
35
|
|
25
|
-
|
36
|
+
#### `SagroneScraper::Agent`
|
37
|
+
|
38
|
+
The agent is responsible for scraping a web page from a URL. Here is how you can create an `agent`:
|
26
39
|
|
27
40
|
1. one way is to pass it a `url` option
|
28
41
|
|
@@ -54,6 +67,38 @@ Here is how you can create an `agent`:
|
|
54
67
|
# => "Milan, Italy"
|
55
68
|
```
|
56
69
|
|
70
|
+
#### `SagroneScraper::Parser`
|
71
|
+
|
72
|
+
The _parser_ is responsible for extracting structured data from a _page_. The page can be obtained by the _agent_.
|
73
|
+
|
74
|
+
Example usage:
|
75
|
+
|
76
|
+
```ruby
|
77
|
+
require 'sagrone_scraper/agent'
|
78
|
+
require 'sagrone_scraper/parser'
|
79
|
+
|
80
|
+
# 1) First define a custom parser, for example twitter.
|
81
|
+
class TwitterParser < SagroneScraper::Parser
|
82
|
+
def bio
|
83
|
+
page.at('.ProfileHeaderCard-bio').text
|
84
|
+
end
|
85
|
+
|
86
|
+
def location
|
87
|
+
page.at('.ProfileHeaderCard-locationText').text
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
# 2) Create an agent scraper, which will give us the page to parse.
|
92
|
+
agent = SagroneScraper::Agent.new(url: 'https://twitter.com/Milano_JS')
|
93
|
+
|
94
|
+
# 3) Instantiate the parser.
|
95
|
+
parser = TwitterParser.new(page: agent.page)
|
96
|
+
|
97
|
+
# 4) Parse page and extract attributes.
|
98
|
+
parser.parse_page!
|
99
|
+
parser.attributes
|
100
|
+
# => {bio: "Javascript User Group Milano #milanojs", location: "Milan, Italy"}
|
101
|
+
```
|
57
102
|
|
58
103
|
## Contributing
|
59
104
|
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
|
3
|
+
module SagroneScraper
|
4
|
+
class Parser
|
5
|
+
Error = Class.new(RuntimeError)
|
6
|
+
|
7
|
+
attr_reader :page, :attributes
|
8
|
+
|
9
|
+
def initialize(options = {})
|
10
|
+
@page = options.fetch(:page) do
|
11
|
+
raise Error.new('Option "page" must be provided.')
|
12
|
+
end
|
13
|
+
@attributes = {}
|
14
|
+
end
|
15
|
+
|
16
|
+
def parse_page!
|
17
|
+
self.class.method_names.each do |name|
|
18
|
+
attributes[name] = send(name)
|
19
|
+
end
|
20
|
+
nil
|
21
|
+
end
|
22
|
+
|
23
|
+
private
|
24
|
+
|
25
|
+
def self.method_names
|
26
|
+
@method_names ||= []
|
27
|
+
end
|
28
|
+
|
29
|
+
def self.method_added(name)
|
30
|
+
puts "added #{name} to #{self}"
|
31
|
+
method_names.push(name)
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'sagrone_scraper/parser'
|
3
|
+
|
4
|
+
RSpec.describe SagroneScraper::Parser do
|
5
|
+
describe '#initialize' do
|
6
|
+
it 'requires a "page" option' do
|
7
|
+
expect { described_class.new }.to raise_error(SagroneScraper::Parser::Error, /Option "page" must be provided./)
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
describe 'instance methods' do
|
12
|
+
let(:page) { Mechanize::Page.new }
|
13
|
+
let(:parser) { described_class.new(page: page) }
|
14
|
+
|
15
|
+
describe '#page' do
|
16
|
+
it { expect(parser.page).to be_a(Mechanize::Page) }
|
17
|
+
end
|
18
|
+
|
19
|
+
describe '#parse_page!' do
|
20
|
+
it { expect(parser.parse_page!).to eq nil }
|
21
|
+
end
|
22
|
+
|
23
|
+
describe '#attributes' do
|
24
|
+
it { expect(parser.attributes).to be_empty }
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
describe 'create custom TwitterParser from SagroneScraper::Parser' do
|
29
|
+
class TwitterParser < SagroneScraper::Parser
|
30
|
+
def bio
|
31
|
+
page.at('.ProfileHeaderCard-bio').text
|
32
|
+
end
|
33
|
+
|
34
|
+
def location
|
35
|
+
page.at('.ProfileHeaderCard-locationText').text
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
before do
|
40
|
+
stub_request_for('https://twitter.com/Milano_JS', 'twitter.com:Milano_JS')
|
41
|
+
end
|
42
|
+
|
43
|
+
let(:page) { Mechanize.new.get('https://twitter.com/Milano_JS') }
|
44
|
+
let(:twitter_parser) { TwitterParser.new(page: page) }
|
45
|
+
let(:expected_attributes) do
|
46
|
+
{
|
47
|
+
bio: "Javascript User Group Milano #milanojs",
|
48
|
+
location: "Milan, Italy"
|
49
|
+
}
|
50
|
+
end
|
51
|
+
|
52
|
+
describe 'should be able to parse page without errors' do
|
53
|
+
it { expect { twitter_parser.parse_page! }.to_not raise_error }
|
54
|
+
end
|
55
|
+
|
56
|
+
it 'should have attributes present after parsing' do
|
57
|
+
twitter_parser.parse_page!
|
58
|
+
|
59
|
+
expect(twitter_parser.attributes).to_not be_empty
|
60
|
+
expect(twitter_parser.attributes).to eq expected_attributes
|
61
|
+
end
|
62
|
+
|
63
|
+
it 'should have correct attributes event if parsing is done multiple times' do
|
64
|
+
twitter_parser.parse_page!
|
65
|
+
twitter_parser.parse_page!
|
66
|
+
twitter_parser.parse_page!
|
67
|
+
|
68
|
+
expect(twitter_parser.attributes).to_not be_empty
|
69
|
+
expect(twitter_parser.attributes).to eq expected_attributes
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|