sagrone_scraper 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d067420377ca0e271b6ba7f8c00f5f6ae2198b85
4
- data.tar.gz: cc93b626d827b17e7f16fa91b7fd00b13936c318
3
+ metadata.gz: 82fc9ba674d9d3398b5f596d513fbb8eeb8abe3b
4
+ data.tar.gz: a8acac43dc318b6dcad951d57d3e1ce59478c67d
5
5
  SHA512:
6
- metadata.gz: 804b9c719e81d87b762f1cea45c3e1919d459d7520270ac1176907e3cb14efef3f992f24f2ed71db3baa85cae7e0fb3b4c5f18394785da396df772a6eeb59755
7
- data.tar.gz: a9b2524b7029896731942e13483e736b45115739d8cc22a7176e9afc477f6b3d460fbe8b9b88c577ae706a976ad10c30d49deba45cdfb9d65206085dd4459f3d
6
+ metadata.gz: 0dc5041f027f685ac241fcf0e103d3d8a6fa225002cdedd65a2f072fafb43904d3514a7cabad9a83bb0d3f02a3f7241c2c8dbeffdb48f133439f308319462870
7
+ data.tar.gz: 15644f0da27c4cb3f2452ac11958261acc7d0bd5e3e0539a7d7817b47f5c5096ea3f0d107b2f456f34f6fe8273fb7742128cd72f5c142c8416a521262ffb8e09
data/CHANGELOG.md CHANGED
@@ -1,5 +1,9 @@
1
1
  ### HEAD
2
2
 
3
+ ### 0.0.2
4
+
5
+ - add `SagroneScraper::Parser`
6
+
3
7
  ### 0.0.1
4
8
 
5
9
  - add `SagroneScraper::Agent`
data/README.md CHANGED
@@ -1,7 +1,18 @@
1
- # SagroneScraper
1
+ # Sagrone scraper
2
+
3
+ [![Gem Version](https://badge.fury.io/rb/sagrone_scraper.svg)](http://badge.fury.io/rb/sagrone_scraper)
4
+ [![Build Status](https://travis-ci.org/Sagrone/scraper.svg?branch=master)](https://travis-ci.org/Sagrone/scraper)
2
5
 
3
6
  Simple library to scrape web pages. Below you will find information on [how to use it](#basic-usage).
4
7
 
8
+ ## Table of Contents
9
+
10
+ - [Installation](#installation)
11
+ - [Basic Usage](#basic-usage)
12
+ - [Modules](#modules)
13
+ + [`SagroneScraper::Agent`](#sagronescraperagent)
14
+ + [`SagroneScraper::Parser`](#sagronescraperparser)
15
+
5
16
  ## Installation
6
17
 
7
18
  Add this line to your application's Gemfile:
@@ -16,13 +27,15 @@ Or install it yourself as:
16
27
 
17
28
  $ gem install sagrone_scraper
18
29
 
19
- ## Usage
30
+ ## Basic Usage
20
31
 
21
- #### `SagroneScraper::Agent`
32
+ Coming soon...
22
33
 
23
- The agent is responsible for scraping a web page from a URL.
34
+ ## Modules
24
35
 
25
- Here is how you can create an `agent`:
36
+ #### `SagroneScraper::Agent`
37
+
38
+ The agent is responsible for scraping a web page from a URL. Here is how you can create an `agent`:
26
39
 
27
40
  1. one way is to pass it a `url` option
28
41
 
@@ -54,6 +67,38 @@ Here is how you can create an `agent`:
54
67
  # => "Milan, Italy"
55
68
  ```
56
69
 
70
+ #### `SagroneScraper::Parser`
71
+
72
+ The _parser_ is responsible for extracting structured data from a _page_. The page can be obtained by the _agent_.
73
+
74
+ Example usage:
75
+
76
+ ```ruby
77
+ require 'sagrone_scraper/agent'
78
+ require 'sagrone_scraper/parser'
79
+
80
+ # 1) First define a custom parser, for example one for Twitter.
81
+ class TwitterParser < SagroneScraper::Parser
82
+ def bio
83
+ page.at('.ProfileHeaderCard-bio').text
84
+ end
85
+
86
+ def location
87
+ page.at('.ProfileHeaderCard-locationText').text
88
+ end
89
+ end
90
+
91
+ # 2) Create an agent scraper, which will give us the page to parse.
92
+ agent = SagroneScraper::Agent.new(url: 'https://twitter.com/Milano_JS')
93
+
94
+ # 3) Instantiate the parser.
95
+ parser = TwitterParser.new(page: agent.page)
96
+
97
+ # 4) Parse page and extract attributes.
98
+ parser.parse_page!
99
+ parser.attributes
100
+ # => {bio: "Javascript User Group Milano #milanojs", location: "Milan, Italy"}
101
+ ```
57
102
 
58
103
  ## Contributing
59
104
 
@@ -0,0 +1,34 @@
1
+ require 'mechanize'
2
+
3
+ module SagroneScraper
4
+ class Parser
5
+ Error = Class.new(RuntimeError)
6
+
7
+ attr_reader :page, :attributes
8
+
9
+ def initialize(options = {})
10
+ @page = options.fetch(:page) do
11
+ raise Error.new('Option "page" must be provided.')
12
+ end
13
+ @attributes = {}
14
+ end
15
+
16
+ def parse_page!
17
+ self.class.method_names.each do |name|
18
+ attributes[name] = send(name)
19
+ end
20
+ nil
21
+ end
22
+
23
+ private
24
+
25
+ def self.method_names
26
+ @method_names ||= []
27
+ end
28
+
29
+ def self.method_added(name)
30
+ puts "added #{name} to #{self}"
31
+ method_names.push(name)
32
+ end
33
+ end
34
+ end
@@ -1,3 +1,3 @@
1
1
  module SagroneScraper
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -78,7 +78,7 @@ RSpec.describe SagroneScraper::Agent do
78
78
 
79
79
  webmock_allow do
80
80
  expect { agent }.to raise_error(SagroneScraper::Agent::Error,
81
- /getaddrinfo: nodename nor servname provided, or not known/)
81
+ /getaddrinfo/)
82
82
  end
83
83
  end
84
84
  end
@@ -0,0 +1,72 @@
1
+ require 'spec_helper'
2
+ require 'sagrone_scraper/parser'
3
+
4
+ RSpec.describe SagroneScraper::Parser do
5
+ describe '#initialize' do
6
+ it 'requires a "page" option' do
7
+ expect { described_class.new }.to raise_error(SagroneScraper::Parser::Error, /Option "page" must be provided./)
8
+ end
9
+ end
10
+
11
+ describe 'instance methods' do
12
+ let(:page) { Mechanize::Page.new }
13
+ let(:parser) { described_class.new(page: page) }
14
+
15
+ describe '#page' do
16
+ it { expect(parser.page).to be_a(Mechanize::Page) }
17
+ end
18
+
19
+ describe '#parse_page!' do
20
+ it { expect(parser.parse_page!).to eq nil }
21
+ end
22
+
23
+ describe '#attributes' do
24
+ it { expect(parser.attributes).to be_empty }
25
+ end
26
+ end
27
+
28
+ describe 'create custom TwitterParser from SagroneScraper::Parser' do
29
+ class TwitterParser < SagroneScraper::Parser
30
+ def bio
31
+ page.at('.ProfileHeaderCard-bio').text
32
+ end
33
+
34
+ def location
35
+ page.at('.ProfileHeaderCard-locationText').text
36
+ end
37
+ end
38
+
39
+ before do
40
+ stub_request_for('https://twitter.com/Milano_JS', 'twitter.com:Milano_JS')
41
+ end
42
+
43
+ let(:page) { Mechanize.new.get('https://twitter.com/Milano_JS') }
44
+ let(:twitter_parser) { TwitterParser.new(page: page) }
45
+ let(:expected_attributes) do
46
+ {
47
+ bio: "Javascript User Group Milano #milanojs",
48
+ location: "Milan, Italy"
49
+ }
50
+ end
51
+
52
+ describe 'should be able to parse page without errors' do
53
+ it { expect { twitter_parser.parse_page! }.to_not raise_error }
54
+ end
55
+
56
+ it 'should have attributes present after parsing' do
57
+ twitter_parser.parse_page!
58
+
59
+ expect(twitter_parser.attributes).to_not be_empty
60
+ expect(twitter_parser.attributes).to eq expected_attributes
61
+ end
62
+
63
+ it 'should have correct attributes event if parsing is done multiple times' do
64
+ twitter_parser.parse_page!
65
+ twitter_parser.parse_page!
66
+ twitter_parser.parse_page!
67
+
68
+ expect(twitter_parser.attributes).to_not be_empty
69
+ expect(twitter_parser.attributes).to eq expected_attributes
70
+ end
71
+ end
72
+ end
@@ -5,4 +5,4 @@ RSpec.describe SagroneScraper do
5
5
  describe '.version' do
6
6
  it { expect(SagroneScraper.version).to be_a(String) }
7
7
  end
8
- end
8
+ end