sagrone_scraper 0.0.1 → 0.0.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: d067420377ca0e271b6ba7f8c00f5f6ae2198b85
-   data.tar.gz: cc93b626d827b17e7f16fa91b7fd00b13936c318
+   metadata.gz: 82fc9ba674d9d3398b5f596d513fbb8eeb8abe3b
+   data.tar.gz: a8acac43dc318b6dcad951d57d3e1ce59478c67d
  SHA512:
-   metadata.gz: 804b9c719e81d87b762f1cea45c3e1919d459d7520270ac1176907e3cb14efef3f992f24f2ed71db3baa85cae7e0fb3b4c5f18394785da396df772a6eeb59755
-   data.tar.gz: a9b2524b7029896731942e13483e736b45115739d8cc22a7176e9afc477f6b3d460fbe8b9b88c577ae706a976ad10c30d49deba45cdfb9d65206085dd4459f3d
+   metadata.gz: 0dc5041f027f685ac241fcf0e103d3d8a6fa225002cdedd65a2f072fafb43904d3514a7cabad9a83bb0d3f02a3f7241c2c8dbeffdb48f133439f308319462870
+   data.tar.gz: 15644f0da27c4cb3f2452ac11958261acc7d0bd5e3e0539a7d7817b47f5c5096ea3f0d107b2f456f34f6fe8273fb7742128cd72f5c142c8416a521262ffb8e09
data/CHANGELOG.md CHANGED
@@ -1,5 +1,9 @@
  ### HEAD

+ ### 0.0.2
+
+ - add `SagroneScraper::Parser`
+
  ### 0.0.1

  - add `SagroneScraper::Agent`
data/README.md CHANGED
@@ -1,7 +1,18 @@
- # SagroneScraper
+ # Sagrone scraper
+
+ [![Gem Version](https://badge.fury.io/rb/sagrone_scraper.svg)](http://badge.fury.io/rb/sagrone_scraper)
+ [![Build Status](https://travis-ci.org/Sagrone/scraper.svg?branch=master)](https://travis-ci.org/Sagrone/scraper)

  Simple library to scrape web pages. Below you will find information on [how to use it](#usage).

+ ## Table of Contents
+
+ - [Installation](#installation)
+ - [Basic Usage](#basic-usage)
+ - [Modules](#modules)
+   + [`SagroneScraper::Agent`](#sagronescraperagent)
+   + [`SagroneScraper::Parser`](#sagronescraperparser)
+
  ## Installation

  Add this line to your application's Gemfile:
@@ -16,13 +27,15 @@ Or install it yourself as:

      $ gem install sagrone_scraper

- ## Usage
+ ## Basic Usage

- #### `SagroneScraper::Agent`
+ Coming soon...

- The agent is responsible for scraping a web page from a URL.
+ ## Modules

- Here is how you can create an `agent`:
+ #### `SagroneScraper::Agent`
+
+ The agent is responsible for scraping a web page from a URL. Here is how you can create an `agent`:

  1. one way is to pass it a `url` option

@@ -54,6 +67,38 @@ Here is how you can create an `agent`:
  # => "Milan, Italy"
  ```

+ #### `SagroneScraper::Parser`
+
+ The _parser_ is responsible for extracting structured data from a _page_. The page can be obtained by the _agent_.
+
+ Example usage:
+
+ ```ruby
+ require 'sagrone_scraper/agent'
+ require 'sagrone_scraper/parser'
+
+ # 1) First define a custom parser, for example twitter.
+ class TwitterParser < SagroneScraper::Parser
+   def bio
+     page.at('.ProfileHeaderCard-bio').text
+   end
+
+   def location
+     page.at('.ProfileHeaderCard-locationText').text
+   end
+ end
+
+ # 2) Create an agent scraper, which will give us the page to parse.
+ agent = SagroneScraper::Agent.new(url: 'https://twitter.com/Milano_JS')
+
+ # 3) Instantiate the parser.
+ parser = TwitterParser.new(page: agent.page)
+
+ # 4) Parse page and extract attributes.
+ parser.parse_page!
+ parser.attributes
+ # => {bio: "Javascript User Group Milano #milanojs", location: "Milan, Italy"}
+ ```

  ## Contributing

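The README's own agent example ("one way is to pass it a `url` option") falls outside the hunks shown above. A minimal sketch of what it describes, assuming the agent fetches the URL when it is created and exposes the result via `page`, as the parser example above uses it:

```ruby
require 'sagrone_scraper/agent'

# One way to create an agent: pass it a `url` option.
# Assumption: the page is fetched on initialization, so `agent.page`
# can be handed straight to a parser.
agent = SagroneScraper::Agent.new(url: 'https://twitter.com/Milano_JS')
agent.page  # => the scraped Mechanize::Page
```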
@@ -0,0 +1,34 @@
+ require 'mechanize'
+
+ module SagroneScraper
+   class Parser
+     Error = Class.new(RuntimeError)
+
+     attr_reader :page, :attributes
+
+     def initialize(options = {})
+       @page = options.fetch(:page) do
+         raise Error.new('Option "page" must be provided.')
+       end
+       @attributes = {}
+     end
+
+     def parse_page!
+       self.class.method_names.each do |name|
+         attributes[name] = send(name)
+       end
+       nil
+     end
+
+     private
+
+     def self.method_names
+       @method_names ||= []
+     end
+
+     def self.method_added(name)
+       puts "added #{name} to #{self}"
+       method_names.push(name)
+     end
+   end
+ end
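The new parser collects attribute methods through Ruby's `method_added` hook: because the hook is defined after the base class's own `initialize` and `parse_page!`, only methods defined later, in practice the ones a subclass adds, are recorded in `method_names`, and `parse_page!` then calls each of them and stores the results in `attributes`. A minimal, self-contained illustration (the `PageStub` struct is hypothetical, standing in for a real `Mechanize::Page`):

```ruby
require 'sagrone_scraper/parser'

# Hypothetical stand-in for a Mechanize::Page, used only to exercise the hook.
PageStub = Struct.new(:title)

class TitleParser < SagroneScraper::Parser
  # Defining this method fires SagroneScraper::Parser.method_added,
  # which records :title in TitleParser.method_names.
  def title
    page.title
  end
end

parser = TitleParser.new(page: PageStub.new('Hello'))
parser.parse_page!   # calls each recorded method, stores its return value
parser.attributes    # => {title: "Hello"}
```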
@@ -1,3 +1,3 @@
  module SagroneScraper
-   VERSION = "0.0.1"
+   VERSION = "0.0.2"
  end
@@ -78,7 +78,7 @@ RSpec.describe SagroneScraper::Agent do

      webmock_allow do
        expect { agent }.to raise_error(SagroneScraper::Agent::Error,
-                                       /getaddrinfo: nodename nor servname provided, or not known/)
+                                       /getaddrinfo/)
      end
    end
  end
@@ -0,0 +1,72 @@
+ require 'spec_helper'
+ require 'sagrone_scraper/parser'
+
+ RSpec.describe SagroneScraper::Parser do
+   describe '#initialize' do
+     it 'requires a "page" option' do
+       expect { described_class.new }.to raise_error(SagroneScraper::Parser::Error, /Option "page" must be provided./)
+     end
+   end
+
+   describe 'instance methods' do
+     let(:page) { Mechanize::Page.new }
+     let(:parser) { described_class.new(page: page) }
+
+     describe '#page' do
+       it { expect(parser.page).to be_a(Mechanize::Page) }
+     end
+
+     describe '#parse_page!' do
+       it { expect(parser.parse_page!).to eq nil }
+     end
+
+     describe '#attributes' do
+       it { expect(parser.attributes).to be_empty }
+     end
+   end
+
+   describe 'create custom TwitterParser from SagroneScraper::Parser' do
+     class TwitterParser < SagroneScraper::Parser
+       def bio
+         page.at('.ProfileHeaderCard-bio').text
+       end
+
+       def location
+         page.at('.ProfileHeaderCard-locationText').text
+       end
+     end
+
+     before do
+       stub_request_for('https://twitter.com/Milano_JS', 'twitter.com:Milano_JS')
+     end
+
+     let(:page) { Mechanize.new.get('https://twitter.com/Milano_JS') }
+     let(:twitter_parser) { TwitterParser.new(page: page) }
+     let(:expected_attributes) do
+       {
+         bio: "Javascript User Group Milano #milanojs",
+         location: "Milan, Italy"
+       }
+     end
+
+     describe 'should be able to parse page without errors' do
+       it { expect { twitter_parser.parse_page! }.to_not raise_error }
+     end
+
+     it 'should have attributes present after parsing' do
+       twitter_parser.parse_page!
+
+       expect(twitter_parser.attributes).to_not be_empty
+       expect(twitter_parser.attributes).to eq expected_attributes
+     end
+
+     it 'should have correct attributes even if parsing is done multiple times' do
+       twitter_parser.parse_page!
+       twitter_parser.parse_page!
+       twitter_parser.parse_page!
+
+       expect(twitter_parser.attributes).to_not be_empty
+       expect(twitter_parser.attributes).to eq expected_attributes
+     end
+   end
+ end
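`stub_request_for` is a helper from the gem's spec support files and is not part of this diff; a plausible sketch, assuming it wraps WebMock's `stub_request` and serves a saved HTML fixture named after the stubbed page (e.g. `spec/fixtures/twitter.com:Milano_JS`):

```ruby
require 'webmock/rspec'

# Sketch only: the real helper (and its fixture layout) lives in the gem's
# spec_helper/support files, which this diff does not include.
def stub_request_for(url, fixture_name)
  fixture_path = File.expand_path("fixtures/#{fixture_name}", __dir__)
  stub_request(:get, url)
    .to_return(status: 200,
               body: File.read(fixture_path),
               headers: { 'Content-Type' => 'text/html' })
end
```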
@@ -5,4 +5,4 @@ RSpec.describe SagroneScraper do
    describe '.version' do
      it { expect(SagroneScraper.version).to be_a(String) }
    end
- end
+ end