scrapula 0.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +3 -0
- data/.rspec +1 -0
- data/.simplecov +1 -0
- data/CHANGELOG.md +15 -0
- data/CONTRIBUTING.md +0 -0
- data/Gemfile +24 -0
- data/Gemfile.lock +127 -0
- data/Guardfile +12 -0
- data/LICENSE +21 -0
- data/README.md +108 -0
- data/ROADMAP.md +42 -0
- data/Rakefile +30 -0
- data/examples/block_syntax.rb +20 -0
- data/examples/find_nodes.rb +6 -0
- data/examples/get_first_and_scrape_later.rb +13 -0
- data/examples/metas.rb +32 -0
- data/examples/more_api.rb +17 -0
- data/examples/nested_results.rb +14 -0
- data/examples/one_liners.rb +9 -0
- data/examples/posting_data.rb +7 -0
- data/examples/s.rb +24 -0
- data/examples/validation.rb +40 -0
- data/lib/scrapula.rb +47 -0
- data/lib/scrapula/_old_scraper.rb +110 -0
- data/lib/scrapula/agent.rb +8 -0
- data/lib/scrapula/data.rb +18 -0
- data/lib/scrapula/page.rb +109 -0
- data/lib/scrapula/page/meta.rb +74 -0
- data/lib/scrapula/request.rb +44 -0
- data/lib/scrapula/s.rb +21 -0
- data/lib/scrapula/scraper.rb +56 -0
- data/lib/scrapula/version.rb +3 -0
- data/scrapula.gemspec +36 -0
- data/spec/cassettes/Scrapula_Page_Meta/_.yml +748 -0
- data/spec/cassettes/Scrapula_Page_Meta/_/Open_Graph.yml +322 -0
- data/spec/cassettes/Scrapula_Page_Meta/_/other_names.yml +586 -0
- data/spec/cassettes/Scrapula_Page_Meta/_/standard_names.yml +429 -0
- data/spec/lib/scrapula/agent_spec.rb +6 -0
- data/spec/lib/scrapula/data_spec.rb +19 -0
- data/spec/lib/scrapula/page/meta_spec.rb +89 -0
- data/spec/lib/scrapula/page_spec.rb +136 -0
- data/spec/lib/scrapula/request_spec.rb +91 -0
- data/spec/lib/scrapula/s_spec.rb +44 -0
- data/spec/lib/scrapula/scraper_spec.rb +205 -0
- data/spec/lib/scrapula_spec.rb +141 -0
- data/spec/spec_helper.rb +26 -0
- metadata +118 -0
@@ -0,0 +1,89 @@
|
|
1
|
+
# Specs for Scrapula::Page::Meta: looking up <meta> values by name through #[].
#
# Every group that calls `use_vcr_cassette` does so without an explicit name.
# NOTE(review): the cassette appears to be derived from the group descriptions
# (see spec/cassettes/Scrapula_Page_Meta/...), so renaming a `describe` would
# presumably orphan its cassette -- confirm before changing any description.
describe Scrapula::Page::Meta do

  # The page is obtained by executing a GET request against the `url`
  # that each nested group defines.
  let(:page) { Scrapula::Request.new(url: url, method: :get).execute }
  subject { described_class.new(page) }

  describe '#initialize' do
  end

  describe '#[]' do
    use_vcr_cassette

    let(:url) { 'ogp.me' }

    # Pending: no body yet.
    it 'symbols'

    it 'returns nil when does not include a meta' do
      expect(subject['yeah']).to be_nil
    end

    it '"charset" meta' do
      expect(subject['charset']).to eq 'utf-8'
    end

    describe 'pragma directives' do
    end

    describe 'standard names' do
      use_vcr_cassette

      # FIXME: use another URL
      # NOTE(review): several expectations below use the same 'Webby' value;
      # verify them against the recorded cassette.
      let(:url) { 'http://ruby-doc.org/' }

      it '"application-name" meta' do
        expect(subject['application-name']).to eq 'Webby'
      end

      it '"author" meta' do
        expect(subject['author']).to eq 'Webby'
      end

      it '"generator" meta' do
        expect(subject['generator']).to eq 'Webby'
      end

      it '"description" meta' do
        expect(subject['description']).to eq 'Fast and searchable Ruby documentation for core, standard libraries, and rubygems. Available in both RDoc and Yard format. Plus, links to tutorials, guides, books, and related sites.'
      end

      it '"keywords" meta' do
        expect(subject['keywords']).to eq 'Webby'
      end
    end

    describe 'Open Graph' do
      use_vcr_cassette

      let(:url) { 'ogp.me' }

      it '"og:title" meta' do
        expect(subject['og:title']).to eq 'Open Graph protocol'
      end

      it '"og:image:height" meta' do
        expect(subject['og:image:height']).to eq '300'
      end

      context 'more than 1 property with the same name' do
        # An array could be ambiguous
        xit '"og:image:height" meta' do
          expect(subject['og:image:height']).to eq '300'
        end
      end
    end

    describe 'other pragma directives' do
    end

    describe 'other names' do
      use_vcr_cassette

      let(:url) { 'https://dev.twitter.com/cards/markup' }

      it '"twitter:title" meta' do
        expect(subject['twitter:title']).to eq 'Cards Markup Tag Reference'
      end
    end
  end

end
|
@@ -0,0 +1,136 @@
|
|
1
|
+
# Specs for Scrapula::Page, built around a verified double of Mechanize::Page.
describe Scrapula::Page do

  let(:query) { '.example' }

  let(:agent_page_double) { instance_double Mechanize::Page }

  subject { described_class.new(agent_page_double) }

  describe '#initialize' do
    it 'requires a Mechanize::Page'
  end

  describe '#scrape' do

    let(:scraper_double) { instance_double Scrapula::Scraper }

    # Invoked explicitly by the examples: stubs #data! on the scraper double
    # and expects Scrapula::Scraper.new to be called, returning that double.
    let(:expect_new_scraper) {
      allow(scraper_double).to receive(:data!)
      expect(Scrapula::Scraper).to receive(:new).and_return scraper_double
    }

    it 'creates a scraper for the page' do
      expect_new_scraper
      # Pass a real (empty) block. The previous `subject.scrape & proc{}`
      # parsed as a binary `&` applied to the return value of #scrape,
      # so no block was ever handed to the method.
      subject.scrape {}
    end

    it 'returns the scraped data' do
      expect_new_scraper
      expect(scraper_double).to receive(:data!).and_return(example: 'example value')

      result = subject.scrape do
        example '#example'
      end

      expect(result).to eq({ example: 'example value' })
    end
  end

  describe '#meta!' do
    # NOTE(review): this example only checks that #meta! does not raise;
    # it makes no assertion about the class of the returned object.
    it 'returns a Scrapula::Meta object' do
      subject.meta!
    end
  end

  # TODO
  describe '#metas!' do
  end

  # TODO
  describe '#search!' do
  end

  describe '#at!' do

    let(:page_search_result_double) { instance_double Nokogiri::XML::NodeSet }

    # Stubs Mechanize::Page#at so that it returns the search-result double.
    let(:expects_page_search) {
      allow(agent_page_double).to receive(:at).and_return page_search_result_double
    }

    it 'requires a query' do
      # NOTE(review): a bare `raise_error` matches ANY exception, including
      # one raised by the unstubbed double itself -- pin the expected error
      # class once the contract of #at! is confirmed.
      expect { subject.at!([]) }.to raise_error
    end

    it 'performs the query' do
      expects_page_search
      expect(agent_page_double).to receive(:at)

      subject.at!(query)
    end

    context 'without receiving an array with operations' do
      it 'returns the query result' do
        expects_page_search

        expect(subject.at!(query)).to be page_search_result_double
      end
    end

    context 'receiving an array with method names' do

      let(:operations) { [:to_s, :upcase, :length] }
      let(:example_string) { "You FOOL! This isn't even my final form!" }
      # Expected value after applying `operations` in order to the result.
      let(:transformed_example) { example_string.upcase.length }

      let(:expects_scraping_using_at) {
        expects_page_search.with query
        expect(page_search_result_double).to receive(:to_s).and_return example_string
      }

      context 'but not receiving a block' do
        it 'applies them in order to the result' do
          expects_scraping_using_at

          expect(subject.at!(query, operations)).to eq transformed_example
        end
      end

      context 'and a block' do
        it 'applies them in order to the result and yields that value' do
          expects_scraping_using_at

          expect { |b| subject.at!(query, operations, &b) }.to yield_with_args transformed_example
        end
      end
    end

  end

  describe '#html!' do

    it 'performs the query' do
      expect(subject).to receive(:at!).with query, any_args
      subject.html!(query)
    end

    context 'without operations' do
      it 'adds one for converting to HTML'
    end

    describe 'with operations' do
      it 'prepends them for converting first to HTML'
    end

    describe 'block' do
    end

  end

  describe '#text!' do
    it 'performs the query'

    it 'returns the inner text'
  end

end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
# Specs for Scrapula::Request: URL normalisation on construction, execution
# through the agent, and the url/method/params readers.
describe Scrapula::Request do

  let(:url) { 'http://example.net' }
  let(:data) { { url: url, method: 'get', params: [{ q: 'lol' }] } }

  subject { described_class.new(data) }

  describe '#initialize' do

    describe 'requires an URL' do
      %w[http https].each do |protocol|
        context "receives one with the #{protocol.upcase} protocol" do
          let(:url) { "#{protocol}://example.net" }

          it 'maintains it' do
            request = described_class.new(url: url, method: :get)
            expect(request.instance_variable_get(:@url)).to eq url
          end
        end
      end

      context 'receives one without protocol' do
        let(:url) { 'example.net' }

        it 'adds the HTTP one' do
          request = described_class.new(url: url, method: :get)
          expect(request.instance_variable_get(:@url)).to eq "http://#{url}"
        end
      end
    end

    describe 'requires a valid HTTP method' do
    end

    describe 'can receive parameters' do
      it 'as an array'

      # Previously a bare string literal outside the group (a no-op
      # expression); now registered as a pending example.
      it 'or not'
    end

    context 'receives a block' do
      it "doesn't invoke it now"
    end

  end

  describe '#execute' do

    let(:agent_double) { instance_double Scrapula::Agent }

    let(:page_double) { instance_double Scrapula::Page }

    # Invoked explicitly by the examples: makes the request use the agent double.
    let(:expects_agent) {
      expect(subject).to receive(:agent).and_return agent_double
    }

    # TODO for each method

    it 'performs an HTTP request using the attributes' do
      expects_agent

      expect(agent_double).to receive(:get) do |*args|
        expect(args[0]).to eq data[:url]
        expect(args[1]).to eq data[:params]

        # TODO other parameters
      end

      subject.execute
    end

    # NOTE(review): nothing here asserts on the value returned by #execute;
    # the example only verifies that the agent receives :get.
    it 'returns the received page' do
      expect(agent_double).to receive(:get).and_return page_double
      expects_agent

      subject.execute
    end

  end

  # TODO agent
  %w[url method params].each do |attribute|

    describe "##{attribute}" do
      it { is_expected.to respond_to attribute }

      it "returns the established #{attribute} of the current request" do
        expect(subject.__send__(attribute)).to eq data[attribute.to_sym]
      end
    end

  end

end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require_relative '../../../lib/scrapula/s'
|
2
|
+
|
3
|
+
# Specs for the `S` shortcut: each entry point is expected to delegate to the
# matching method on Scrapula.
describe 'S' do

  describe '()' do
    it 'implicit GET request' do
      expect(Scrapula).to receive(:get)
      S('url')
    end

    it 'returns'

    context 'with block' do
    end
  end

  # One group per HTTP method listed in Scrapula::HTTP_METHODS.
  Scrapula::HTTP_METHODS.each do |http_method|
    describe ".#{http_method}" do
      it "explicit #{http_method.to_s.upcase} request" do
        expect(Scrapula).to receive(http_method)
        S.__send__(http_method, 'url')
      end

      it 'returns'

      context 'with block' do
      end
    end
  end

  # Short alias => HTTP method it is expected to trigger.
  shortcuts = { g: 'get' }

  shortcuts.each do |aka, http_method|

    describe ".#{aka}" do
      it "#{aka} performs a #{http_method.to_s.upcase} request" do
        expect(Scrapula).to receive(http_method)
        S.__send__(aka, 'url')
      end
    end
  end

end
|
@@ -0,0 +1,205 @@
|
|
1
|
+
# Specs for Scrapula::Scraper, the block-based DSL that collects scraped values
# into a hash exposed through #data!. The page is a verified double of
# Scrapula::Page.
describe Scrapula::Scraper do

  subject { described_class.new(page_double) }

  let(:page_double) { instance_double Scrapula::Page }

  # TODO: better test titles
  describe '#initialize' do

    xit 'requires a page' do
    end

    context 'without block' do
    end

    context 'with block' do

      # Stubs Page#txt! for the '#example' query.
      let(:allow_example_query) {
        allow(page_double).to receive(:txt!).with('#example').and_return 'example value'
      }

      # Stubs Page#at!, which the nested-block examples stub before running.
      let(:allow_out_block) {
        allow(page_double).to receive(:at!)
      }

      describe 'without any query' do
        it 'returns an empty hash' do
          result = described_class.new(page_double) do
          end
          expect(result.data!).to eq({})
        end
      end

      describe 'and inner attribute' do

        # TODO: how to know if it's a value or a query, and corner cases
        # (using attr! 'value' for assigning?)
        xcontext 'value' do
          it 'assigns the value to hash' do
            result = described_class.new(page_double) do
              example 'value'
            end
            expected = { example: 'value' }

            expect(result.data!).to eq expected
          end
        end

        context 'query' do

          # Including groups must define `expected_value`.
          shared_examples :query do
            before(:each) {
              expect(page_double).to receive(:txt!).with('.example').and_return expected_value
            }

            it 'extracts the text' do
              described_class.new(page_double) do
                example '.example'
              end
            end

            it 'assigns the result to a hash' do
              result = described_class.new(page_double) do
                example '.example'
              end
              expected = { example: expected_value }

              expect(result.data!).to eq expected
            end
          end

          context "that doesn't find anything" do
            it "doesn't crash"
            it 'returns nil'
          end

          context 'that returns only one element' do
            let(:expected_value) { 'example value' }

            include_examples :query
          end

          # TODO
          xcontext 'that returns several elements' do
            let(:expected_value) { %w[value1 value2 value3] }

            include_examples :query
          end
        end

        # TODO: how to know if it's a value or a query, and corner cases
        # (using attr? for returning values)
        xdescribe 'attributes can be invoked' do
          it 'returns its previous value' do
            allow_example_query

            result = described_class.new(page_double) do
              example '#example'
              other example
            end
            expected = { example: 'example value', other: 'example value' }

            expect(result.data!).to eq expected
          end
        end
      end

      # TODO: how should this behave? Ignore the query?
      xdescribe 'with query and inner query' do

        it 'assigns the query result to a hash' do
          allow_out_block.with '#outer', '#out'
          allow_example_query

          result = described_class.new(page_double) do
            outer '#outer' do
              out '#out' do
                example '#example'
              end
            end
          end
          expected = { outer: { out: { example: 'example value' } } }

          expect(result.data!).to eq expected
        end

      end

      describe 'without query and inner query' do

        it 'assigns the query result to a hash' do
          allow_out_block
          allow_example_query

          result = described_class.new(page_double) do
            outer do
              out do
                example '#example'
              end
            end
          end
          expected = { outer: { out: { example: 'example value' } } }

          expect(result.data!).to eq expected
        end

      end
    end

    context 'with block with parameters' do
      it 'yields the page' do
        expect { |b| described_class.new(page_double, &b) }.to yield_with_args page_double
      end
    end

  end

  describe '#method_missing' do
    it 'works only inside a block'

    context 'String as argument' do
      it 'queries the page '
    end

    context 'String and an Array as arguments' do
      it 'queries the page '
      it 'applies the operations '
    end

    context 'block' do
      it 'creates '
    end
  end

  describe '#respond_to?' do
    it 'responds always with `true`' do
      expect(subject.respond_to?(:any)).to be true
      expect(subject.respond_to?('any.method')).to be true
    end
  end

  xdescribe '#execute' do

    context 'without block' do
      it 'returns itself' do
        expect(subject.execute).to be subject
      end
    end

    context 'with block' do
      it 'returns the scraped data'
    end

    context 'block with parameters' do
      it 'yields the page' do
        # Parenthesised block-pass: `subject.execute &b` is ambiguous to the
        # parser (block argument vs. binary `&`).
        expect { |b| subject.execute(&b) }.to yield_with_args page_double
      end
    end

  end

  describe '#validate' do
  end

end
|