scapeshift 1.0.1rg0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,136 @@
1
+ require 'set'
2
+ require 'nokogiri'
3
+ require 'open-uri'
4
+
5
+ module Scapeshift
6
+ module Crawlers
7
+
8
+ ##
9
+ # The Meta crawler scrapes meta data such as expansion sets and formats
10
+ # from the Oracle main search page. Like the other Crawlers, it overrides
11
+ # the {#crawl} method from {Base}.
12
+ #
13
+ # @example Directly instantiating the crawler
14
+ # crawler = Scapeshift::Crawlers::Meta.new :type => :sets
15
+ # @sets = crawler.crawl
16
+ #
17
+ # @author Josh Lindsey
18
+ #
19
+ # @since 0.1.4
20
+ #
21
+ class Meta < Base
22
+ has_callback_hook :before_scrape
23
+ has_callback_hook :after_scrape
24
+
25
+ ## @return [Nokogiri::HTML::Document] The Nokogiri document representing the page
26
+ attr_reader :doc
27
+
28
+ ## @return [SortedSet <String>] The SortedSet containing the scraped data
29
+ attr_reader :meta
30
+
31
+ ## The Oracle homepage, which is what we are scraping from
32
+ Meta_URI = 'http://gatherer.wizards.com/Pages/Default.aspx'
33
+
34
##
# Builds a Meta crawler and checks that it was told what to scrape.
#
# @param [Hash] opts Options selecting the metadata to scrape
# @option opts [Symbol (:sets|:formats|:types)] :type The kind of metadata to scrape (required)
#
# @return [Scapeshift::Crawlers::Meta] The Meta crawler object
#
# @raise [Scapeshift::Errors::InsufficientOptions] If :type isn't passed
#
# @author Josh Lindsey
#
# @since 0.3.0
#
def initialize(opts = {})
  super(opts)

  # The _scrape_* helpers append their results to this set.
  @meta = SortedSet.new

  return unless options[:type].nil?

  raise Scapeshift::Errors::InsufficientOptions.new("This crawler MUST be passed :type")
end
57
+
58
##
# Fetches the Oracle homepage and scrapes the requested metadata from it.
# Overridden from {Base#crawl}.
#
# Fires the :before_scrape hook with the parsed document and the
# :after_scrape hook with the collected data.
#
# @return [SortedSet <String>] A SortedSet containing the data
#
# @raise [Scapeshift::Errors::UnknownMetaType] If an unsupported metadata type is supplied
#
# @author Josh Lindsey
#
# @since 0.1.0
#
def crawl
  # Use URI#open (mixed in by open-uri) instead of Kernel#open, which no
  # longer accepts URLs as of Ruby 3.0. The block form also closes the
  # response IO once Nokogiri has finished reading it.
  @doc = URI.parse(Meta_URI).open { |page| Nokogiri::HTML(page) }

  hook :before_scrape, @doc

  case @options[:type]
  when :sets
    _scrape_sets @doc
  when :formats
    _scrape_formats @doc
  when :types
    _scrape_types @doc
  else
    raise Scapeshift::Errors::UnknownMetaType.new("Unknown metadata type: '#{options[:type]}'")
  end

  hook :after_scrape, @meta

  @meta
end
90
+
91
+ private
92
+
93
##
# Collects the expansion set names from the search form's set selector
# and appends them to {#meta}.
#
# @param [Nokogiri::HTML::Document] doc The full document of the Oracle page
#
# @author Josh Lindsey
#
# @since 0.1.4
#
def _scrape_sets(doc)
  sets = doc.css 'select#ctl00_ctl00_MainContent_Content_SearchControls_setAddText'
  sets.children.each do |set|
    value = set['value']
    # Children without a value attribute (e.g. whitespace text nodes between
    # the options) yield nil, and an empty value is not a set name either.
    next if value.nil? || value.empty?

    @meta << value
  end
end
106
+
107
##
# Collects the format names from the search form's format selector and
# appends them to {#meta}.
#
# @param [Nokogiri::HTML::Document] doc The full document of the Oracle page
#
# @author Josh Lindsey
#
# @since 0.1.4
#
def _scrape_formats(doc)
  formats = doc.css 'select#ctl00_ctl00_MainContent_Content_SearchControls_formatAddText'
  formats.children.each do |format|
    value = format['value']
    # Skip children that carry no usable value (nil for nodes without the
    # attribute, '' for a blank option), as the set scraper does. A nil
    # entry could not be ordered against the strings in the set.
    next if value.nil? || value.empty?

    @meta << value
  end
end
120
+
121
##
# Collects the card type names from the search form's type selector and
# appends them to {#meta}.
#
# @param [Nokogiri::HTML::Document] doc The full document of the Oracle page
#
# @author Josh Lindsey
#
# @since 0.1.4
#
def _scrape_types(doc)
  types = doc.css 'select#ctl00_ctl00_MainContent_Content_SearchControls_typeAddText'
  types.children.each do |type|
    value = type['value']
    # Skip children that carry no usable value (nil for nodes without the
    # attribute, '' for a blank option), as the set scraper does. A nil
    # entry could not be ordered against the strings in the set.
    next if value.nil? || value.empty?

    @meta << value
  end
end
134
+ end
135
+ end
136
+ end
@@ -0,0 +1,404 @@
1
+ require 'uri'
2
+ require 'nokogiri'
3
+ require 'open-uri'
4
+
5
+ module Scapeshift
6
+ module Crawlers
7
+
8
+ ##
9
+ # Scrapes the Oracle card detail page for a single card. Like
10
+ # the other Crawlers, it overrides the {#crawl} method from {Base}.
11
+ #
12
+ # @example Directly instantiating the crawler
13
+ # crawler = Scapeshift::Crawlers::Single.new :name => 'Counterspell'
14
+ # @card = crawler.crawl
15
+ #
16
+ # @todo Add support for scraping Planechase Plane cards.
17
+ #
18
+ # @author Josh Lindsey
19
+ #
20
+ # @since 0.2.0
21
+ #
22
+ class Single < Base
23
+ has_callback_hook :before_scrape
24
+ has_callback_hook :after_scrape
25
+ has_callback_hook :every_attr
26
+
27
+ ## The details page for cards by multiverse id. Joined with a card's multiverse id.
28
+ Card_Multiverse_ID_Search_URI = 'http://gatherer.wizards.com/Pages/Card/Details.aspx?multiverseid='
29
+
30
+ ## The base search page for card names. Joined to {Card_Name_Frag}.
31
+ Card_Name_Search_URI = 'http://gatherer.wizards.com/Pages/Search/Default.aspx?name='
32
+
33
+ ## The search fragment for each word in the name. Interpolated
34
+ ## with each word in the Card name.
35
+ Card_Name_Frag = '+[%s]'
36
+
37
+ ## @return [Scapeshift::Card] The {Card} object representing the scraped data
38
+ attr_reader :card
39
+
40
+ ## @return [Nokogiri::HTML::Document] The Nokogiri document representing the card detail page
41
+ attr_reader :doc
42
+
43
##
# Creates a new Single crawler object. The card to scrape is identified
# either by name or by multiverse id; at least one of the two is required.
#
# @param [Hash] opts Options hash
# @option opts [String] :name The name of the card to scrape
# @option opts [#to_s] :multiverse_id The multiverse id of the card to scrape
#
# @return [Scapeshift::Crawlers::Single] The Single crawler object
#
# @raise [Scapeshift::Errors::InsufficientOptions] If neither :name nor
#   :multiverse_id is passed
#
# @author Josh Lindsey
#
# @since 0.3.0
#
def initialize(opts = {})
  super(opts)

  @card = Scapeshift::Card.new

  return unless options[:name].nil? && options[:multiverse_id].nil?

  raise Scapeshift::Errors::InsufficientOptions.new("This crawler MUST be passed one of :name or :multiverse_id")
end
66
+
67
##
# Scrapes the Oracle card detail page for the requested card. The card is
# looked up by :multiverse_id when that option is set, otherwise by :name.
# Overrides the {Base#crawl} method.
#
# Fires :before_scrape with the parsed document, :every_attr after each
# attribute is stored on the card, and :after_scrape with the finished card.
#
# @return [Scapeshift::Card] The Card containing the scraped data
#
# @raise [Scapeshift::Errors::CardNameAmbiguousOrNotFound]
#   If instead of being redirected to the Card detail page, this crawler
#   finds itself on a search results page.
#
# @author Josh Lindsey
#
# @since 0.2.0
#
def crawl
  uri_str = if !options[:multiverse_id].nil?
              Card_Multiverse_ID_Search_URI + options[:multiverse_id].to_s
            elsif !options[:name].nil?
              options[:name].split(' ').inject(Card_Name_Search_URI) { |memo, word| memo + (Card_Name_Frag % word) }
            end

  # URI.escape was removed in Ruby 3.0; the parser object offers the same
  # escaping. Fall back to URI.escape where DEFAULT_PARSER does not exist.
  escaper = defined?(URI::DEFAULT_PARSER) ? URI::DEFAULT_PARSER : URI

  # URI#open (mixed in by open-uri) replaces Kernel#open, which no longer
  # accepts URLs as of Ruby 3.0. The block form closes the response IO once
  # Nokogiri has finished reading it.
  @doc = URI.parse(escaper.escape(uri_str)).open { |page| Nokogiri::HTML(page) }

  hook :before_scrape, @doc

  # Check to make sure we're actually on the card detail page.
  unless doc.css('div.filterList').empty?
    # Report whichever identifier was actually used for the lookup.
    wanted = options[:multiverse_id].nil? ? options[:name] : "multiverse id #{options[:multiverse_id]}"
    raise Scapeshift::Errors::CardNameAmbiguousOrNotFound.new("Unable to find card: '#{wanted}'")
  end

  # Store one attribute on the card, then announce it through :every_attr.
  assign = lambda do |writer, value|
    @card.send(writer, value)
    hook :every_attr, @card
  end

  assign.call(:name=,          _parse_name(doc))
  assign.call(:cost=,          _parse_cost(doc))
  assign.call(:types=,         _parse_types(doc))
  assign.call(:text=,          _parse_text(doc))
  assign.call(:flavour_text=,  _parse_flavour_text(doc))
  assign.call(:sets=,          _parse_sets(doc))
  assign.call(:pow_tgh=,       _parse_pow_tgh(doc))
  assign.call(:loyalty=,       _parse_loyalty(doc))
  assign.call(:artist=,        _parse_artist(doc))
  assign.call(:multiverse_id=, _parse_multiverse_id(doc))
  # Must follow multiverse_id=: it reads the id back from the card.
  assign.call(:image_uri_from_id=, @card.multiverse_id)
  assign.call(:number=,        _parse_number(doc))

  hook :after_scrape, @card

  @card
end
137
+
138
+ private
139
+
140
##
# Reads the card name out of the name row of the detail page.
#
# @param [Nokogiri::HTML::Document] doc The detail page document
#
# @return [String] The card's name, stripped of surrounding whitespace
#   (empty if the row has no content)
#
# @author Josh Lindsey
#
# @since 0.2.0
#
def _parse_name(doc)
  name_cell = doc.css('div#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_nameRow/div[2]')
  first_node = name_cell.children.first
  first_node.to_s.strip
end
155
+
156
##
# Builds the card's mana cost string from the mana symbol images in the
# mana row of the detail page.
#
# @param [Nokogiri::HTML::Document] doc The detail page document
#
# @return [String] The formatted string representation of the card's cost
#   (eg. "2BU"); empty when the row holds no images
#
# @see Scapeshift::Card.cost_symbol_from_str
#
# @author Josh Lindsey
#
# @since 0.2.0
#
def _parse_cost(doc)
  symbols = doc.css('div#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_manaRow/div[2]/img')
  # Each image's alt text is translated to its symbol and appended in page order.
  symbols.inject('') { |cost, img| cost << Scapeshift::Card.cost_symbol_from_str(img['alt']) }
end
176
+
177
##
# Reads the card's type line out of the type row of the detail page.
#
# @param [Nokogiri::HTML::Document] doc The detail page document
#
# @return [String] The types line string, stripped of surrounding whitespace
#
# @see Scapeshift::Card#types=
#
# @author Josh Lindsey
#
# @since 0.2.0
#
def _parse_types(doc)
  type_cell = doc.css('div#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_typeRow/div[2]')
  first_node = type_cell.children.first
  first_node.to_s.strip
end
194
+
195
##
# Scrape the card's flavour text from the detail page.
#
# @param [Nokogiri::HTML::Document] doc The detail page document
#
# @return [String] The flavour text, stripped of surrounding whitespace
#
# @see #_recursive_parse_text
#
# @author Eric Cohen
#
# @since 1.0.1
#
def _parse_flavour_text(doc)
  blocks = doc.css('div#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_FlavorText/div[@class=cardtextbox]')

  # _recursive_parse_text appends to the string it is handed.
  flavour_text = ''
  _recursive_parse_text blocks, 0, nil, flavour_text
  flavour_text.strip
end
214
+
215
##
# Scrape the card's rules text from the detail page.
#
# @param [Nokogiri::HTML::Document] doc The detail page document
#
# @return [String] The rules text, stripped of surrounding whitespace
#
# @see #_recursive_parse_text
#
# @author Josh Lindsey
#
# @since 0.2.0
#
def _parse_text(doc)
  blocks = doc.css('div#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_textRow/div[2]/div[@class=cardtextbox]')

  # _recursive_parse_text appends to the string it is handed.
  text = ''
  _recursive_parse_text blocks, 0, nil, text
  text.strip
end
234
+
235
##
# Scrapes the printings (sets and rarities) of the card. The printing shown
# on the page comes first, followed by every image in the other-sets row.
#
# Image titles are expected to look like "Set Name (Rarity)". A title that
# does not fit that shape yields a [nil, nil] entry. If the set row holds no
# image at all, the current printing is simply left out.
#
# @param [Nokogiri::HTML::Document] doc The detail page document
#
# @return [Array] The array of [set, rarity] pairs
#
# @author Josh Lindsey
#
# @since 0.2.0
#
def _parse_sets(doc)
  # The set name is matched greedily so that only the LAST parenthesised
  # group is taken as the rarity, even if the set name contains parentheses.
  printing = /^(.*) \(([^()]*)\)$/

  current = doc.css('div#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_setRow')./('img').first
  others = doc.css('div#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_otherSetsRow')./('img')

  images = []
  images << current unless current.nil?
  others.each { |other| images << other }

  images.map do |image|
    match = printing.match(image['title'].to_s)
    match ? [match[1], match[2]] : [nil, nil]
  end
end
262
+
263
##
# Scrapes the card's Power and Toughness (if a creature card).
#
# @param [Nokogiri::HTML::Document] doc The detail page document
#
# @return [Array] The power and toughness, as two strings
# @return [nil] If the page has no P/T row, or the row's text is not of the
#   form "power / toughness"
#
# @author Josh Lindsey
#
# @since 0.2.0
#
def _parse_pow_tgh(doc)
  pt_row = doc.css('div#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_ptRow')
  return nil if pt_row.empty?

  pt_str = pt_row./('div[2]').children.first.to_s.strip
  match = %r{^(.*?) / (.*?)$}.match(pt_str)

  # The loyalty parser reads this same row, so its text is not always a
  # "power / toughness" pair; report nil rather than [nil, nil] in that case.
  match && [match[1], match[2]]
end
283
+
284
##
# Scrapes the card's loyalty (if a planeswalker card). The value is taken
# from the same row the power/toughness parser reads, and is only accepted
# when the row's text consists of digits alone.
#
# @param [Nokogiri::HTML::Document] doc The detail page document
#
# @return [String] The card's loyalty
# @return [nil] If the row is missing or its text is not purely digits
#
# @author Eric Cohen
#
# @since 1.0.1
#
def _parse_loyalty(doc)
  row = doc.css('div#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_ptRow')
  return nil if row.empty?

  cell_text = row./('div[2]').children.first.to_s.strip
  # String#[] with a capture index yields the group, or nil on no match.
  cell_text[/^([0-9]*)$/, 1]
end
304
+
305
##
# Reads the artist's name out of the artist credit link on the detail page.
#
# @param [Nokogiri::HTML::Document] doc The detail page document
#
# @return [String] The card's Artist, stripped of surrounding whitespace
#
# @author Eric Cohen
#
# @since 1.0.1
#
def _parse_artist(doc)
  credit_link = doc.css('div#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_ArtistCredit/a')
  first_node = credit_link.children.first
  first_node.to_s.strip
end
320
+
321
##
# Scrapes the multiverse ID of this card so the Card object can
# interpolate it into the image URI. The id is read from the
# "multiverseid" query parameter of the card image's src attribute.
#
# @param [Nokogiri::HTML::Document] doc The detail page document
#
# @return [String] The multiverse ID of this card
# @return [nil] If the card image is missing or its src carries no id
#
# @author Josh Lindsey
#
# @since 0.2.0
#
def _parse_multiverse_id(doc)
  image = doc.css('img#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_cardImage').first
  return nil if image.nil?

  # Match the digits themselves rather than "everything up to the next &",
  # so the id is still found when multiverseid is the last query parameter.
  image['src'].to_s[/multiverseid=(\d+)/, 1]
end
338
+
339
##
# Scrapes the card number of this card.
#
# @param [Nokogiri::HTML::Document] doc The detail page document
#
# @return [String] The card number, stripped of surrounding whitespace
#   (empty if the number row has no content)
#
# @author Eric Cohen
#
# @since 1.0.1
#
def _parse_number(doc)
  number_cell = doc.css('div#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_numberRow .value')
  number_cell.children.first.to_s.strip
end
354
+
355
##
# Walks a list of sibling nodes from the detail page and appends their
# textual content to +text+, descending into nested elements as it goes.
# Mana images are converted to symbols. Used by the rules-text and
# flavour-text parsers.
#
# Spacing rules: a text box div starts a new line; a mana symbol is
# preceded by a space unless it directly follows another symbol; text or
# an italic run that follows a symbol is preceded by a space.
#
# @param [Array] node_ary The sibling nodes to walk
# @param [Integer] pos The index in node_ary to start from
# @param [Symbol] last_element The last element traversed, used for formatting
# @param [String] text The string being built; it is appended to in place
#
# @see #_parse_text
# @see Scapeshift::Card.cost_symbol_from_str
#
# @author Josh Lindsey
#
# @since 0.2.0
#
def _recursive_parse_text(node_ary, pos, last_element, text)
  until (node = node_ary[pos]).nil?
    is_element = node.is_a?(Nokogiri::XML::Element)

    if is_element && node['class'] == 'cardtextbox'
      # Text holder div
      text << "\n"
      _recursive_parse_text(node.children, 0, :div, text)
    elsif is_element && node.name == 'img'
      # Mana image
      text << ' ' unless last_element == :img
      text << Scapeshift::Card.cost_symbol_from_str(node['alt'])
      last_element = :img
    elsif is_element && node.name == 'i'
      # Keyword text
      text << ' ' if last_element == :img
      _recursive_parse_text(node.children, 0, :i, text)
    elsif node.is_a?(Nokogiri::XML::Text)
      # Regular text
      text << ' ' if last_element == :img
      text << node.to_s.strip
      last_element = :text
    end

    # Nested walks do not change this level's last_element; only images and
    # text handled at this level do.
    pos += 1
  end
end
401
+ end
402
+ end
403
+ end
404
+