scapeshift 1.0.1rg0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,64 @@
1
+ require 'scapeshift/crawlers'
2
+ require 'scapeshift/card'
3
+ require 'scapeshift/errors'
4
+
5
+ module Scapeshift
6
+
7
+ ##
8
+ # The main Crawler class, which handles the routing of commands
9
+ # to the specific {Crawlers}. This is the main class that end-users
10
+ # should be interacting with.
11
+ #
12
+ # @example Scraping the Sets
13
+ # @sets = Scapeshift::Crawler.crawl :meta, :type => :sets
14
+ #
15
+ # @example Scraping all the cards from the Shards of Alara block
16
+ # @cards = Scapeshift::Crawler.crawl :cards, :set => 'Shards of Alara'
17
+ #
18
+ # @see Scapeshift::Crawlers::Meta
19
+ # @see Scapeshift::Crawlers::Cards
20
+ # @see Scapeshift::Crawlers::Single
21
+ #
22
+ # @author Josh Lindsey
23
+ #
24
+ # @since 0.1.0
25
+ #
26
class Crawler

  ##
  # Entry point for end-users of the gem. Builds the crawler matching
  # +type+, optionally hands it to the caller's block (e.g. to register
  # callbacks), and then runs it.
  #
  # @param [Symbol] type The type of crawl operation to perform
  #   (:meta, :cards or :single)
  # @param [Hash] opts Options forwarded untouched to the crawler's constructor.
  #   See the classes in {Scapeshift::Crawlers} for a list of options.
  #
  # @yield [Scapeshift::Crawlers::Base] The instantiated crawler, before it is run.
  #
  # @return [Object] Whatever the selected crawler's #crawl method returns.
  #
  # @raise [Scapeshift::Errors::InvalidCrawlerType] If an unrecognized crawler type is specified
  #
  # @author Josh Lindsey
  #
  # @since 0.1.0
  #
  def self.crawl type, opts = {}, &block
    crawler_class =
      case type
      when :meta   then Scapeshift::Crawlers::Meta
      when :cards  then Scapeshift::Crawlers::Cards
      when :single then Scapeshift::Crawlers::Single
      end

    if crawler_class.nil?
      raise Scapeshift::Errors::InvalidCrawlerType.new "Invalid crawler type '#{type}'"
    end

    crawler = crawler_class.new opts
    yield crawler if block_given?
    crawler.crawl
  end
end
64
+ end
@@ -0,0 +1,20 @@
1
+ require 'scapeshift/crawlers/base'
2
+ require 'scapeshift/crawlers/cards'
3
+ require 'scapeshift/crawlers/meta'
4
+ require 'scapeshift/crawlers/single'
5
+
6
module Scapeshift

  ##
  # Namespace for the different web scrapers. Every crawler class
  # defined in this module is expected to expose a single public
  # instance method that performs the scrape: #crawl
  #
  # @todo Add callback methods to each Crawler (eg. Sets.each_card &block)
  #
  # @author Josh Lindsey
  #
  # @since 0.1.0
  module Crawlers
  end

end
@@ -0,0 +1,107 @@
1
+ module Scapeshift
2
+ module Crawlers
3
+
4
+ ##
5
+ # Base crawler class that all other crawlers should extend.
6
+ #
7
+ # @author Josh Lindsey
8
+ #
9
+ # @since 0.3.0
10
+ #
11
+ # @abstract
12
+ #
13
class Base
  ## @return [Hash] Options hash. Keys will differ between crawlers.
  attr_accessor :options

  ##
  # Returns a new instance of a Crawler.
  #
  # @param [Hash] opts The options hash. Keys will differ between crawlers.
  #
  # @return [Crawlers::Base] The Crawler instance
  #
  # @raise [Scapeshift::Errors::InsufficientOptions] If the opts hash is nil or empty
  #
  # @author Josh Lindsey
  #
  # @since 0.3.0
  #
  def initialize(opts = {})
    # nil is rejected explicitly so callers get the documented error
    # rather than a NoMethodError from calling #empty? on nil.
    if opts.nil? || opts.empty?
      raise Scapeshift::Errors::InsufficientOptions.new "The options hash must not be null"
    end

    self.options = opts
  end

  ##
  # Abstract required method for each subclass.
  #
  # @raise [Scapeshift::Errors::InvalidSubclass] If any subclass fails to implement this method
  #
  # @author Josh Lindsey
  #
  # @since 0.3.0
  #
  def crawl
    raise Scapeshift::Errors::InvalidSubclass.new "Subclasses of Crawlers::Base must implement #crawl"
  end

  ##
  # Calls every block registered on this instance for the specified hook,
  # in registration order, passing along any objects given as args.
  # A hook with no registered blocks (or one that was never declared)
  # is a no-op.
  #
  # @param [Symbol] symbol The symbol for the hook to call
  # @param [Object] *args The splatted list of objects to pass to each block
  #
  # @return [Array<Proc>] The blocks that were called
  #
  # @author Josh Lindsey
  #
  # @since 0.3.0
  #
  def hook symbol, *args
    _callbacks_for(symbol).each { |callback| callback.call(*args) }
  end

  ##
  # Adds the named callback hook to the class. Calling this method
  # defines an instance method with the same name as the symbol passed
  # in. That method accepts a block and stores it on the instance it
  # was called on; the stored blocks are run by {#hook} when it is
  # passed the same symbol.
  #
  # Callbacks are kept per instance, so a block registered on one
  # crawler is never run by another crawler, and declaring the same
  # hook name in several subclasses does not disturb any of them.
  #
  # @example Add a callback hook named "before_foo" to a class
  #   class Test < Crawlers::Base
  #     has_callback_hook :before_foo
  #
  #     def foo
  #       str = "Hello, world!"
  #       self.hook :before_foo, str
  #       puts str
  #     end
  #   end
  #
  #   test = Test.new :some => :option
  #   test.before_foo { |str| str.replace "Baz" }
  #   test.foo # => Baz
  #
  # @param [Symbol] symbol The name of the hook (and of the generated method)
  #
  # @author Josh Lindsey
  #
  # @since 0.3.0
  #
  def self.has_callback_hook symbol
    # define_method avoids evaluating a string built from the hook name.
    define_method(symbol) do |&block|
      raise ArgumentError, "#{symbol} requires a block" if block.nil?

      _callbacks_for(symbol) << block
    end
  end

  private

  ##
  # Returns the (lazily created) list of blocks registered on this
  # instance for the given hook name.
  #
  # @param [Symbol, String] symbol The hook name
  #
  # @return [Array<Proc>] The mutable list of registered blocks
  #
  def _callbacks_for symbol
    # Built on demand so it also works for subclasses whose #initialize
    # does not call super.
    @_callbacks ||= {}
    @_callbacks[symbol.to_sym] ||= []
  end
end
105
+ end
106
+ end
107
+
@@ -0,0 +1,331 @@
1
+ require 'set'
2
+ require 'uri'
3
+ require 'nokogiri'
4
+ require 'open-uri'
5
+
6
+ module Scapeshift
7
+ module Crawlers
8
+
9
+ ##
10
+ # The Card crawler scrapes Card data from the Oracle textual
11
+ # spoiler pages. Like the other Crawlers, it overrides the {#crawl}
12
+ # method inherited from {Base}.
13
+ #
14
+ # @example Directly instantiating the crawler
15
+ # crawler = Scapeshift::Crawlers::Cards.new :set => 'Shards of Alara'
16
+ # @cards = crawler.crawl
17
+ #
18
+ # @author Josh Lindsey
19
+ #
20
+ # @since 0.1.0
21
+ #
22
+ class Cards < Base
23
# Callback hooks exposed by this crawler; #crawl fires them through #hook.
#   before_scrape - receives the parsed document before any row is processed
#   after_scrape  - receives the collection of cards once all rows are processed
#   every_card    - receives the current card each time #crawl finishes one
has_callback_hook(:before_scrape)
has_callback_hook(:after_scrape)
has_callback_hook(:every_card)
26
+
27
## The Base URI we grab from. The single %s placeholder receives the
## search fragments built from the options passed-in.
Text_Spoiler_URI = 'http://gatherer.wizards.com/Pages/Search/Default.aspx?output=spoiler&method=text%s'.freeze

## The search fragment if we're searching on Blocks.
## NOTE(review): identical to Format_Search_Frag (both use the `format`
## query parameter) -- presumably intentional; confirm against Gatherer.
Block_Search_Frag = '&format=["%s"]'.freeze

## The search fragment if we're searching on Sets.
Set_Search_Frag = '&set=["%s"]'.freeze

## The search fragment if we're searching on Formats.
Format_Search_Frag = '&format=["%s"]'.freeze
38
+
39
## @return [Nokogiri::HTML::Document] The parsed page this instance is scraping (assigned by #crawl)
attr_reader(:doc)

## @return [SortedSet <Scapeshift::Card>] The {Card} objects collected so far
attr_reader(:cards)

## @return [Scapeshift::Card] The {Card} that is currently being assembled
attr_reader(:current_card)
47
+
48
+ ##
49
+ # Creates a new Cards crawler object.
50
+ #
51
+ # @param [Hash] opts The options that determine what to scrape. At least one of these *must* be set.
52
+ # @option opts [String] :set ('') The set to scrape (eg. "Darksteel")
53
+ # @option opts [String] :block ('') The block to scrape (eg. "Lorwyn block")
54
+ # @option opts [String] :format ('') The format to scrape (eg. "Legacy")
55
+ #
56
+ # @return [Scapeshift::Crawlers::Cards] The new Cards crawler
57
+ #
58
+ # @raise [Scapeshift::Errors::InsufficientOptions] If none of :set, :block, or :format is set
59
+ #
60
+ # @author Josh Lindsey
61
+ #
62
+ # @since 0.3.0
63
+ #
64
def initialize(opts = {})
  super opts

  @cards = SortedSet.new

  # The crawler needs at least one search criterion to build its query.
  if [:set, :block, :format].all? { |key| self.options[key].nil? }
    raise Scapeshift::Errors::InsufficientOptions.new "This crawler MUST be passed one of :set, :block, or :format."
  end
end
73
+
74
+ ##
75
+ # Scrapes the Oracle Text Spoiler page for the specified set, block, or format.
76
+ # Overridden from {Base#crawl}.
77
+ #
78
+ # @return [SortedSet <Card>] A Set containing the {Card} objects we've scraped
79
+ #
80
+ # @author Josh Lindsey
81
+ #
82
+ # @since 0.1.0
83
+ #
84
def crawl
  uri = _spoiler_uri

  # Kernel#open stopped accepting URLs in Ruby 3.0; URI.open is the
  # open-uri entry point there. Older rubies only have the Kernel form.
  opener = URI.respond_to?(:open) ? URI : Kernel
  # Block form so the downloaded stream is closed once it has been parsed.
  @doc = opener.open(uri) { |page| Nokogiri::HTML page }
  rows = doc.xpath('//div[@class="textspoiler"]/table/tr')

  self.hook :before_scrape, self.doc

  @current_card = nil

  rows.each do |row|
    # The row between cards is just a `<td><br /></td>`,
    # so when we see that we know the last card is finished.
    if row.children.length == 2
      _finish_current_card
      next
    end

    @current_card = Scapeshift::Card.new if @current_card.nil?
    _parse_row row
  end

  # Keep the last card even when the table does not end with a separator row.
  _finish_current_card

  self.hook :after_scrape, self.cards

  @cards
end

# Builds the escaped text-spoiler URI from the :block, :set and :format options.
def _spoiler_uri
  search_frag = ''.dup
  [[:block,  Block_Search_Frag],
   [:set,    Set_Search_Frag],
   [:format, Format_Search_Frag]].each do |key, frag|
    value = self.options[key]
    search_frag << frag % value unless value.nil?
  end

  # URI.escape was removed in Ruby 3.0; the RFC 2396 parser provides the
  # same escaping. The dedicated constant only exists on newer rubies.
  parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
  parser.escape(Text_Spoiler_URI % search_frag)
end
private :_spoiler_uri

# Fires :every_card for the card in progress, stores it, and resets the
# cursor. Does nothing when no card is in progress, so nil is never
# handed to callbacks or added to the set.
def _finish_current_card
  return if @current_card.nil?

  self.hook :every_card, self.current_card
  @cards << @current_card
  @current_card = nil
end
private :_finish_current_card
121
+
122
+ private
123
+
124
+ ##
125
+ # Primary "router" method that's called on every iteration of the main
126
+ # {#crawl} loop. Passes the current row to the specialized parser methods.
127
+ #
128
+ # @param [Nokogiri::XML::NodeSet] row The Nokogiri NodeSet comprising the table row to parse
129
+ #
130
+ # @raise [Scapeshift::Errors::UnknownCardAttribute] Raised if this method encounters a card attr
131
+ # it doesn't recognize
132
+ #
133
+ # @author Josh Lindsey
134
+ #
135
+ # @since 0.1.0
136
+ #
137
def _parse_row row
  # Work out the attribute once; it is also needed for the error message.
  kind = _row_type(row)

  case kind
  when :name
    @current_card.name = _parse_name row
    @current_card.multiverse_id = _parse_image_uri row
    # Read the id back from the card so the image URI is derived from
    # whatever the card actually stored.
    @current_card.image_uri_from_id = @current_card.multiverse_id
  when :cost         then @current_card.cost = _parse_cost row
  when :type         then @current_card.types = _parse_type row
  when :'pow/tgh'    then @current_card.pow_tgh = _parse_pow_tgh row
  when :'rules text' then @current_card.text = _parse_rules_text row
  when :'set/rarity' then @current_card.sets = _parse_set_rarity row
  when :loyalty      then @current_card.loyalty = _parse_loyalty row
  else
    raise Scapeshift::Errors::UnknownCardAttribute.new "Unable to parse attribute: '#{kind}'"
  end
end
159
+
160
+ ##
161
+ # Determines which Card attribute this row contains.
162
+ #
163
+ # @param [Nokogiri::XML::NodeSet] row The Nokogiri NodeSet comprising the table row to parse
164
+ #
165
+ # @return [Symbol] The data this row contains
166
+ #
167
+ # @author Josh Lindsey
168
+ #
169
+ # @since 0.1.0
170
+ #
171
def _row_type row
  label = (row / 'td[1]').children.first.to_s
  # Trim whitespace, drop the final character (presumably the ':' that
  # ends labels such as "Name:"), and normalise to a lowercase symbol.
  label.strip.chop.downcase.to_sym
end
174
+
175
+ ##
176
+ # Parses the Card name out of the appropriate row.
177
+ #
178
+ # @param [Nokogiri::XML::NodeSet] row The NodeSet containing the name data
179
+ #
180
+ # @return [String] The Card name
181
+ #
182
+ # @author Josh Lindsey
183
+ #
184
+ # @since 0.1.0
185
+ #
186
def _parse_name row
  # The name is the last child node of the link in the second cell.
  link_children = (row / 'td[2]/a').children
  link_children.last.to_s
end
189
+
190
+ ##
191
+ # Parses the "multiverse id" (Wizards' internal card ID, I guess) out of
192
+ # the card detail link. This can be used then to build the URI to the
193
+ # card image.
194
+ #
195
+ # @param [Nokogiri::XML::NodeSet] row The NodeSet containing the link
196
+ #
197
+ # @return [String] The Card's "multiverse id"
198
+ #
199
+ # @author Josh Lindsey
200
+ #
201
+ # @since 0.1.0
202
+ #
203
def _parse_image_uri row
  link = (row / 'td[2]/a').first
  # A cell without a link (or a link without an href) yields nil instead
  # of raising NoMethodError.
  href = link && link[:href]
  return nil if href.nil?

  # String#[] with a capture index returns the id, or nil when the href
  # carries no multiverseid parameter, without touching the $1 global.
  href.to_s[/multiverseid=(\d+)/, 1]
end
207
+
208
+ ##
209
+ # Parses the mana cost of the card.
210
+ #
211
+ # @param [Nokogiri::XML::NodeSet] row The NodeSet containing the cost
212
+ #
213
+ # @return [String] A string that should look something like "2BR"
214
+ # (for a cost of 2 colorless mana, one black mana, and one red mana).
215
+ #
216
+ # @author Josh Lindsey
217
+ #
218
+ # @since 0.1.0
219
+ #
220
def _parse_cost row
  # The cost is the last child node of the second cell.
  cost_node = (row / 'td[2]').children.last
  cost_node.to_s.strip
end
223
+
224
+ ##
225
+ # Parses the card types of the card. Simply passes the extracted String
226
+ # to the Card object. It handles the rest.
227
+ #
228
+ # @see Card#types=
229
+ #
230
+ # @param [Nokogiri::XML::NodeSet] row The NodeSet containing the types
231
+ #
232
+ # @return [String] The card type line
233
+ #
234
+ # @author Josh Lindsey
235
+ #
236
+ # @since 0.1.0
237
+ #
238
def _parse_type row
  # The type line is the first child node of the second cell.
  type_node = (row / 'td[2]').children.first
  type_node.to_s.strip
end
241
+
242
+ ##
243
+ # Parses the Power and Toughness of creature cards.
244
+ #
245
+ # @param [Nokogiri::XML::NodeSet] row The NodeSet containing the power and toughness ratings
246
+ #
247
+ # @return [Array [Power, Toughness]] The array containing the power and toughness for creature cards
248
+ # @return [nil] nil for non-creature-cards
249
+ #
250
+ # @author Josh Lindsey
251
+ #
252
+ # @since 0.1.0
253
+ #
254
def _parse_pow_tgh row
  raw = (row / 'td[2]').children.first.to_s.strip
  return nil if raw.empty?

  # Expected shape is "(power/toughness)"; anything else gives [nil, nil].
  found = /\((.*?)\/(.*?)\)/.match(raw)
  found ? found.captures : [nil, nil]
end
260
+
261
+ ##
262
+ # Parses the actual body text of the card.
263
+ #
264
+ # @param [Nokogiri::XML::NodeSet] row The NodeSet containing the rules text
265
+ #
266
+ # @return [String] The parsed body text, with <br /> tags converted into \n
267
+ #
268
+ # @author Josh Lindsey
269
+ #
270
+ # @since 0.1.0
271
+ #
272
def _parse_rules_text row
  # <br> nodes become newlines; every other node contributes its
  # whitespace-trimmed string form.
  pieces = (row / 'td[2]').children.map do |node|
    node.name == 'br' ? "\n" : node.to_s.strip
  end

  pieces.inject(''.dup) { |text, piece| text << piece }
end
285
+
286
+ ##
287
+ # Parses the sets and rarities the card was printed in.
288
+ #
289
+ # @param [Nokogiri::XML::NodeSet] row The NodeSet containing the rarities and sets
290
+ #
291
+ # @return [Array [[Set, Rarity]]] The sets and the rarity the card was in that set
292
+ #
293
+ # @author Josh Lindsey
294
+ #
295
+ # @since 0.1.0
296
+ #
297
def _parse_set_rarity row
  printings = (row / 'td[2]').children.first.to_s.strip.split(', ')

  printings.map do |printing|
    if printing.include? 'Mythic Rare'
      # The only two-word rarity, so it cannot be split off as "the last word".
      [printing.sub(' Mythic Rare', ''), 'Mythic Rare']
    else
      # Everything before the final word is the set name; the final word is the rarity.
      words = printing.split(' ')
      rarity = words.pop
      [words.join(' '), rarity]
    end
  end
end
313
+
314
+ ##
315
+ # Parses the loyalty of a planeswalker card.
316
+ #
317
+ # @param [Nokogiri::XML::NodeSet] row The NodeSet containing the loyalty
318
+ #
319
+ # @return [String] The loyalty of the planeswalker card
320
+ #
321
+ # @author Eric Cohen
322
+ #
323
+ # @since 1.0.1
324
+ #
325
def _parse_loyalty row
  wrapped = (row / 'td[2]').children.first.to_s.strip
  # Drop the first and last characters (presumably the surrounding parentheses).
  wrapped[1..-2]
end
328
+ end
329
+ end
330
+ end
331
+