scrapula 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +7 -0
  2. data/.gitignore +3 -0
  3. data/.rspec +1 -0
  4. data/.simplecov +1 -0
  5. data/CHANGELOG.md +15 -0
  6. data/CONTRIBUTING.md +0 -0
  7. data/Gemfile +24 -0
  8. data/Gemfile.lock +127 -0
  9. data/Guardfile +12 -0
  10. data/LICENSE +21 -0
  11. data/README.md +108 -0
  12. data/ROADMAP.md +42 -0
  13. data/Rakefile +30 -0
  14. data/examples/block_syntax.rb +20 -0
  15. data/examples/find_nodes.rb +6 -0
  16. data/examples/get_first_and_scrape_later.rb +13 -0
  17. data/examples/metas.rb +32 -0
  18. data/examples/more_api.rb +17 -0
  19. data/examples/nested_results.rb +14 -0
  20. data/examples/one_liners.rb +9 -0
  21. data/examples/posting_data.rb +7 -0
  22. data/examples/s.rb +24 -0
  23. data/examples/validation.rb +40 -0
  24. data/lib/scrapula.rb +47 -0
  25. data/lib/scrapula/_old_scraper.rb +110 -0
  26. data/lib/scrapula/agent.rb +8 -0
  27. data/lib/scrapula/data.rb +18 -0
  28. data/lib/scrapula/page.rb +109 -0
  29. data/lib/scrapula/page/meta.rb +74 -0
  30. data/lib/scrapula/request.rb +44 -0
  31. data/lib/scrapula/s.rb +21 -0
  32. data/lib/scrapula/scraper.rb +56 -0
  33. data/lib/scrapula/version.rb +3 -0
  34. data/scrapula.gemspec +36 -0
  35. data/spec/cassettes/Scrapula_Page_Meta/_.yml +748 -0
  36. data/spec/cassettes/Scrapula_Page_Meta/_/Open_Graph.yml +322 -0
  37. data/spec/cassettes/Scrapula_Page_Meta/_/other_names.yml +586 -0
  38. data/spec/cassettes/Scrapula_Page_Meta/_/standard_names.yml +429 -0
  39. data/spec/lib/scrapula/agent_spec.rb +6 -0
  40. data/spec/lib/scrapula/data_spec.rb +19 -0
  41. data/spec/lib/scrapula/page/meta_spec.rb +89 -0
  42. data/spec/lib/scrapula/page_spec.rb +136 -0
  43. data/spec/lib/scrapula/request_spec.rb +91 -0
  44. data/spec/lib/scrapula/s_spec.rb +44 -0
  45. data/spec/lib/scrapula/scraper_spec.rb +205 -0
  46. data/spec/lib/scrapula_spec.rb +141 -0
  47. data/spec/spec_helper.rb +26 -0
  48. metadata +118 -0
@@ -0,0 +1,89 @@
1
+ describe Scrapula::Page::Meta do
2
+
3
+ let(:page) { Scrapula::Request.new(url: url, method: :get).execute }
4
+ subject { described_class.new page }
5
+
6
+ describe '#initialize' do
7
+ end
8
+
9
+ describe '#[]' do
10
+ use_vcr_cassette
11
+
12
+ let(:url) { 'ogp.me' }
13
+
14
+ it 'symbols'
15
+
16
+ it 'returns nil when does not include a meta' do
17
+ expect(subject['yeah']).to eq nil
18
+ end
19
+
20
+ it '"charset" meta' do
21
+ expect(subject['charset']).to eq 'utf-8'
22
+ end
23
+
24
+ describe 'pragma directives' do
25
+ end
26
+
27
+ describe 'standard names' do
28
+ use_vcr_cassette
29
+
30
+ # FIXME other URL
31
+ let(:url) { 'http://ruby-doc.org/' }
32
+
33
+ it '"application-name" meta' do
34
+ expect(subject['application-name']).to eq 'Webby'
35
+ end
36
+
37
+ it '"author" meta' do
38
+ expect(subject['author']).to eq 'Webby'
39
+ end
40
+
41
+ it '"generator" meta' do
42
+ expect(subject['generator']).to eq 'Webby'
43
+ end
44
+
45
+ it '"description" meta' do
46
+ expect(subject['description']).to eq 'Fast and searchable Ruby documentation for core, standard libraries, and rubygems. Available in both RDoc and Yard format. Plus, links to tutorials, guides, books, and related sites.'
47
+ end
48
+
49
+ it '"keywords" meta' do
50
+ expect(subject['keywords']).to eq 'Webby'
51
+ end
52
+ end
53
+
54
+ describe 'Open Graph' do
55
+ use_vcr_cassette
56
+
57
+ let(:url) { 'ogp.me' }
58
+
59
+ it '"og:title" meta' do
60
+ expect(subject['og:title']).to eq 'Open Graph protocol'
61
+ end
62
+
63
+ it '"og:image:height" meta' do
64
+ expect(subject['og:image:height']).to eq '300'
65
+ end
66
+
67
+ context 'more than 1 property with the same name' do
68
+ # An array could be ambiguous
69
+ xit '"og:image:height" meta' do
70
+ expect(subject['og:image:height']).to eq '300'
71
+ end
72
+ end
73
+ end
74
+
75
+ describe 'other pragma directives' do
76
+ end
77
+
78
+ describe 'other names' do
79
+ use_vcr_cassette
80
+
81
+ let(:url) { 'https://dev.twitter.com/cards/markup' }
82
+
83
+ it '"twitter:title" meta' do
84
+ expect(subject['twitter:title']).to eq 'Cards Markup Tag Reference'
85
+ end
86
+ end
87
+ end
88
+
89
+ end
@@ -0,0 +1,136 @@
1
+ describe Scrapula::Page do
2
+
3
+ let(:query) { '.example' }
4
+
5
+ let(:agent_page_double) { instance_double Mechanize::Page }
6
+
7
+ subject { described_class.new agent_page_double }
8
+
9
+ describe '#initialize' do
10
+ it 'requires a Mechanize::Page'
11
+ end
12
+
13
+ describe '#scrape' do
14
+
15
+ let(:scraper_double) { instance_double Scrapula::Scraper }
16
+
17
+ let(:expect_new_scraper) {
18
+ allow(scraper_double).to receive(:data!)
19
+ expect(Scrapula::Scraper).to receive(:new).and_return scraper_double
20
+ }
21
+
22
+ it 'creates a scraper for the page' do
23
+ expect_new_scraper
24
+ subject.scrape & proc{}
25
+ end
26
+
27
+ it 'returns the scraped data' do
28
+ expect_new_scraper
29
+ expect(scraper_double).to receive(:data!).and_return example: 'example value'
30
+
31
+ result = subject.scrape do
32
+ example '#example'
33
+ end
34
+
35
+ expect(result).to eq({ example: 'example value' })
36
+ end
37
+ end
38
+
39
+ describe '#meta!' do
40
+ it 'returns a Scrapula::Meta object' do
41
+ subject.meta!
42
+ end
43
+ end
44
+
45
+ # TODO
46
+ describe '#metas!' do
47
+ end
48
+
49
+ # TODO
50
+ describe '#search!' do
51
+ end
52
+
53
+ describe '#at!' do
54
+
55
+ let(:page_search_result_double) { instance_double Nokogiri::XML::NodeSet }
56
+
57
+ let(:expects_page_search) {
58
+ allow(agent_page_double).to receive(:at).and_return page_search_result_double
59
+ }
60
+
61
+ it 'requires a query' do
62
+ expect { subject.at! [] }.to raise_error
63
+ end
64
+
65
+ it 'performs the query' do
66
+ expects_page_search
67
+ expect(agent_page_double).to receive(:at)
68
+
69
+ subject.at! query
70
+ end
71
+
72
+ context 'without receiving an array with operations' do
73
+ it 'returns the query result' do
74
+ expects_page_search
75
+
76
+ expect(subject.at! query).to be page_search_result_double
77
+ end
78
+ end
79
+
80
+ context 'receiving an array with method names' do
81
+
82
+ let(:operations) { [:to_s, :upcase, :length] }
83
+ let(:example_string) { "You FOOL! This isn't even my final form!" }
84
+ let(:transformed_example) { example_string.upcase.length }
85
+
86
+ let(:expects_scraping_using_at) {
87
+ expects_page_search.with query
88
+ expect(page_search_result_double).to receive(:to_s).and_return example_string
89
+ }
90
+
91
+ context 'but not receiving a block' do
92
+ it 'applies them in order to the result' do
93
+ expects_scraping_using_at
94
+
95
+ expect(subject.at!(query, operations)).to eq transformed_example
96
+ end
97
+ end
98
+
99
+ context 'and a block' do
100
+ it 'applies them in order to the result and yields that value' do
101
+ expects_scraping_using_at
102
+
103
+ expect {|b| subject.at!(query, operations, &b) }.to yield_with_args transformed_example
104
+ end
105
+ end
106
+ end
107
+
108
+ end
109
+
110
+ describe '#html!' do
111
+
112
+ it 'performs the query' do
113
+ expect(subject).to receive(:at!).with query, any_args
114
+ subject.html! query
115
+ end
116
+
117
+ context 'without operations' do
118
+ it 'adds one for converting to HTML'
119
+ end
120
+
121
+ describe 'with operations' do
122
+ it 'prepends them for converting first to HTML'
123
+ end
124
+
125
+ describe 'block' do
126
+ end
127
+
128
+ end
129
+
130
+ describe '#text!' do
131
+ it 'performs the query'
132
+
133
+ it 'returns the inner text'
134
+ end
135
+
136
+ end
@@ -0,0 +1,91 @@
1
+ describe Scrapula::Request do
2
+
3
+ let(:url) { 'http://example.net' }
4
+ let(:data) { { url: url, method: 'get', params: [{ q: 'lol' }] } }
5
+
6
+ subject { described_class.new data }
7
+
8
+ describe '#initialize' do
9
+
10
+ describe 'requires an URL' do
11
+ %w[http https].each do |protocol|
12
+ context "receives one with the #{protocol.upcase} protocol" do
13
+ let(:url) { "#{protocol}://example.net" }
14
+
15
+ it 'maintains it' do
16
+ expect(described_class.new(url: url, method: :get).instance_variable_get :@url).to eq url
17
+ end
18
+ end
19
+ end
20
+
21
+ context 'receives one without protocol' do
22
+ let(:url) { 'example.net' }
23
+
24
+ it 'adds the HTTP one' do
25
+ expect(described_class.new(url: url, method: :get).instance_variable_get :@url).to eq "http://#{url}"
26
+ end
27
+ end
28
+ end
29
+
30
+ describe 'requires a valid HTTP method' do
31
+ end
32
+
33
+ describe 'can receive parameters' do
34
+ it 'as an array'
35
+ end
36
+ 'or not'
37
+
38
+ context 'receives a block' do
39
+ it "doesn't invoke it now"
40
+ end
41
+
42
+ end
43
+
44
+ describe '#execute' do
45
+
46
+ let(:agent_double) { instance_double Scrapula::Agent }
47
+
48
+ let(:page_double) { instance_double Scrapula::Page }
49
+
50
+ let(:expects_agent) {
51
+ expect(subject).to receive(:agent).and_return agent_double
52
+ }
53
+
54
+ # TODO for each method
55
+
56
+ it 'performs an HTTP request using the attributes' do
57
+ expects_agent
58
+
59
+ expect(agent_double).to receive(:get) do |*args|
60
+ expect(args[0]).to eq data[:url]
61
+ expect(args[1]).to eq data[:params]
62
+
63
+ # TODO other parameters
64
+ end
65
+
66
+ subject.execute
67
+ end
68
+
69
+ it 'returns the received page' do
70
+ expect(agent_double).to receive(:get).and_return page_double
71
+ expects_agent
72
+
73
+ subject.execute
74
+ end
75
+
76
+ end
77
+
78
+ # TODO agent
79
+ %w[url method params].each do |attribute|
80
+
81
+ describe "##{attribute}" do
82
+ it { is_expected.to respond_to attribute }
83
+
84
+ it "returns the established #{attribute} of the current request" do
85
+ expect(subject.__send__ attribute).to eq data[attribute.to_sym]
86
+ end
87
+ end
88
+
89
+ end
90
+
91
+ end
@@ -0,0 +1,44 @@
1
+ require_relative '../../../lib/scrapula/s'
2
+
3
+ describe 'S' do
4
+
5
+ describe '()' do
6
+ it 'implicit GET request' do
7
+ expect(Scrapula).to receive(:get)
8
+ S('url')
9
+ end
10
+
11
+ it 'returns'
12
+
13
+ context 'with block' do
14
+ end
15
+ end
16
+
17
+ Scrapula::HTTP_METHODS.each do |http_method|
18
+ describe ".#{http_method.to_s}" do
19
+ it "explicit #{http_method.to_s.upcase} request" do
20
+ expect(Scrapula).to receive(http_method)
21
+ S.__send__ http_method, 'url'
22
+ end
23
+
24
+ it 'returns'
25
+
26
+ context 'with block' do
27
+ end
28
+ end
29
+ end
30
+
31
+ shortcuts = { g: 'get' }
32
+
33
+ shortcuts.keys.each do |aka|
34
+ http_method = shortcuts[aka]
35
+
36
+ describe ".#{aka.to_s}" do
37
+ it "#{aka.to_s} performs a #{http_method.to_s.upcase} request" do
38
+ expect(Scrapula).to receive(http_method)
39
+ S.__send__ aka, 'url'
40
+ end
41
+ end
42
+ end
43
+
44
+ end
@@ -0,0 +1,205 @@
1
+ describe Scrapula::Scraper do
2
+
3
+ subject { described_class.new page_double }
4
+
5
+ let(:page_double) {
6
+ instance_double Scrapula::Page
7
+ }
8
+
9
+ # TODO better test titles
10
+ describe '#initialize' do
11
+
12
+ xit 'requires a page' do
13
+ end
14
+
15
+ context 'without block' do
16
+ end
17
+
18
+ context 'with block' do
19
+
20
+ let(:allow_example_query) {
21
+ allow(page_double).to receive(:txt!).with('#example').and_return 'example value'
22
+ }
23
+
24
+ let(:allow_out_block) {
25
+ allow(page_double).to receive(:at!)
26
+ }
27
+
28
+ describe 'without any query' do
29
+ it 'returns an empty hash' do
30
+ result = described_class.new page_double do
31
+ end
32
+ expect(result.data!).to eq({})
33
+ end
34
+ end
35
+
36
+ describe 'and inner attribute' do
37
+
38
+ # TODO how to know if it's a value or a query and corner cases (using attr! 'value' for assigning?)
39
+ xcontext 'value' do
40
+ it 'assigns the value to hash' do
41
+ result = described_class.new page_double do
42
+ example 'value'
43
+ end
44
+ expected = { example: 'value' }
45
+
46
+ expect(result.data!).to eq expected
47
+ end
48
+ end
49
+
50
+ context 'query' do
51
+
52
+ shared_examples :query do
53
+ before(:each) {
54
+ expect(page_double).to receive(:txt!).with('.example').and_return expected_value
55
+ }
56
+
57
+ it 'extracts the text' do
58
+ described_class.new page_double do
59
+ example '.example'
60
+ end
61
+ end
62
+
63
+ it 'assigns the result to a hash' do
64
+ result = described_class.new page_double do
65
+ example '.example'
66
+ end
67
+ expected = { example: expected_value }
68
+
69
+ expect(result.data!).to eq expected
70
+ end
71
+ end
72
+
73
+ context "that doesn't find anything" do
74
+ it "doesn't crash"
75
+ it 'returns nil'
76
+ end
77
+
78
+ context 'that returns only one element' do
79
+ let(:expected_value) { 'example value' }
80
+
81
+ include_examples :query
82
+ end
83
+
84
+ # TODO
85
+ xcontext 'that returns several elements' do
86
+ let(:expected_value) { %w[value1 value2 value3] }
87
+
88
+ include_examples :query
89
+ end
90
+ end
91
+
92
+ # TODO how to know if it's a value or a query and corner cases (using attr? for returning values)
93
+ xdescribe 'attributes can be invoked' do
94
+ it 'returns its previous value' do
95
+ allow_example_query
96
+
97
+ result = described_class.new page_double do
98
+ example '#example'
99
+ other example
100
+ end
101
+ expected = { example: 'example value', other: 'example value' }
102
+
103
+ expect(result.data!).to eq expected
104
+ end
105
+ end
106
+ end
107
+
108
+ # TODO how should this behave? ignore the query?
109
+ xdescribe 'with query and inner query' do
110
+
111
+ it 'assigns the query result to a hash' do
112
+ allow_out_block.with '#outer', '#out'
113
+ allow_example_query
114
+
115
+ result = described_class.new page_double do
116
+ outer '#outer' do
117
+ out '#out' do
118
+ example '#example'
119
+ end
120
+ end
121
+ end
122
+ expected = { outer: { out: { example: 'example value' } } }
123
+
124
+ expect(result.data!).to eq expected
125
+ end
126
+
127
+ end
128
+
129
+ describe 'without query and inner query' do
130
+
131
+ it 'assigns the query result to a hash' do
132
+ allow_out_block
133
+ allow_example_query
134
+
135
+ result = described_class.new page_double do
136
+ outer do
137
+ out do
138
+ example '#example'
139
+ end
140
+ end
141
+ end
142
+ expected = { outer: { out: { example: 'example value' } } }
143
+
144
+ expect(result.data!).to eq expected
145
+ end
146
+
147
+ end
148
+ end
149
+
150
+ context 'with block with parameters' do
151
+ it 'yields the page' do
152
+ expect {|b| described_class.new page_double, &b }.to yield_with_args page_double
153
+ end
154
+ end
155
+
156
+ end
157
+
158
+ describe '#method_missing' do
159
+ it 'works only inside a block'
160
+
161
+ context 'String as argument' do
162
+ it 'queries the page '
163
+ end
164
+
165
+ context 'String and an Array as arguments' do
166
+ it 'queries the page '
167
+ it 'applies the operations '
168
+ end
169
+
170
+ context 'block' do
171
+ it 'creates '
172
+ end
173
+ end
174
+
175
+ describe '#respond_to?' do
176
+ it 'responds always with `true`' do
177
+ expect(subject.respond_to? :any).to be true
178
+ expect(subject.respond_to? 'any.method').to be true
179
+ end
180
+ end
181
+
182
+ xdescribe '#execute' do
183
+
184
+ context 'without block' do
185
+ it 'returns itself' do
186
+ expect(subject.execute).to be subject
187
+ end
188
+ end
189
+
190
+ context 'with block' do
191
+ it 'returns the scraped data'
192
+ end
193
+
194
+ context 'block with parameters' do
195
+ it 'yields the page' do
196
+ expect {|b| subject.execute &b }.to yield_with_args page_double
197
+ end
198
+ end
199
+
200
+ end
201
+
202
+ describe '#validate' do
203
+ end
204
+
205
+ end