taylorbarstow-nytimes-articles 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION.yml ADDED
@@ -0,0 +1,4 @@
1
+ ---
2
+ :minor: 2
3
+ :patch: 1
4
+ :major: 0
@@ -0,0 +1,6 @@
1
# $KCODE only has an effect on Ruby 1.8. On 1.9+ the assignment is ignored and
# merely triggers a "no longer effective" warning, so set it only where it matters.
$KCODE = 'UTF8' if RUBY_VERSION < '1.9'

# Load the library's parts. 'base' and 'facet' come before 'article' because
# Article subclasses Base and reads Facet::ALL_FACETS while its class body is evaluated.
%w(exceptions base facet thumbnail article result_set query).each do |f|
  require File.join(File.dirname(__FILE__), 'nytimes_articles', f)
end
@@ -0,0 +1,462 @@
1
+ require 'rubygems'
2
+
3
+ module Nytimes
4
+ module Articles
5
+ ##
6
+ # The Article class represents a single article returned from the New York Times Article Search API. Note that an article can have many attributes
7
+ # but these are not necessarily populated unless you explicitly request them in the reply from the server via the <tt>:fields</tt> parameter to
8
+ # search (or use <tt>:fields => :all</tt>).
9
+ class Article < Base
10
# Names of the article fields, grouped by the kind of value each one holds.
RAW_FIELDS = %w(url).freeze
TEXT_FIELDS = %w(abstract author body byline lead_paragraph nytd_lead_paragraph nytd_title title).freeze
NUMERIC_FIELDS = %w(word_count).freeze
BOOLEAN_FIELDS = %w(fee small_image).freeze
IMAGE_FIELDS = %w(small_image small_image_url small_image_height small_image_width).freeze
MULTIMEDIA_FIELDS = %w(multimedia related_multimedia).freeze

# Every field name known to this class. 'small_image' is listed in both BOOLEAN_FIELDS and
# IMAGE_FIELDS, so the combined list is de-duplicated to keep each name exactly once.
ALL_FIELDS = (TEXT_FIELDS + RAW_FIELDS + NUMERIC_FIELDS + BOOLEAN_FIELDS + MULTIMEDIA_FIELDS + Facet::ALL_FACETS + IMAGE_FIELDS).uniq.freeze

# Value used as the begin date when a search supplies only an upper date bound (YYYYMMDD).
EARLIEST_BEGIN_DATE = '19810101'.freeze

attr_reader(*ALL_FIELDS)

# special additional objects
attr_reader :thumbnail

# Scalar facets
attr_reader :page, :column, :pub_month, :pub_year, :pub_day, :day_of_week, :desk, :date, :section_page, :source

# Facets that return multiple values
attr_reader :classifiers, :descriptions, :geo, :material_types, :organizations, :persons, :nytd_bylines, :nytd_descriptions, :nytd_geo, :nytd_organizations, :nytd_persons, :nytd_sections, :nytd_works_mentioned, :works_mentioned
alias :people :persons
alias :nytd_people :nytd_persons
33
+
34
##
# Builds an Article by storing every key/value pair of +params+ in an instance variable named after the key.
# Article instances are normally created for you from API results, so calling this directly is rarely needed.
def initialize(params={})
  params.each { |name, value| instance_variable_set("@#{name}", value) }
end
41
+
42
+ ##
43
+ # Is this article available for a fee?
44
+ alias :fee? :fee
45
+
46
##
# True when the article is not flagged as requiring a fee (the negation of #fee?).
def free?
  !fee?
end
51
+
52
##
# Builds an Article from a hash of raw result values keyed by API field name. This is called for
# search results; application code has no reason to call it.
def self.init_from_api(params)
  attributes = {
    :abstract => text_field(params['abstract']),
    :author => text_field(params['author']),
    :body => text_field(params['body']),
    :byline => text_field(params['byline']),
    :fee => boolean_field(params['fee']),
    :lead_paragraph => text_field(params['lead_paragraph']),
    :nytd_title => text_field(params['nytd_title']),
    :nytd_lead_paragraph => text_field(params['nytd_lead_paragraph']),
    :related_multimedia => nil, # FIXME
    :thumbnail => Thumbnail.init_from_api(params),
    :title => text_field(params['title']),
    :url => params['url'],
    :word_count => integer_field(params['word_count']),

    # Facets that hold a single value
    :page => integer_field(params[Facet::PAGE]),
    :column => text_field(params[Facet::COLUMN]),
    :pub_month => integer_field(params[Facet::PUB_MONTH]),
    :pub_year => integer_field(params[Facet::PUB_YEAR]),
    :pub_day => integer_field(params[Facet::PUB_DAY]),
    :day_of_week => params[Facet::DAY_OF_WEEK],
    :desk => text_field(params[Facet::DESK]),
    :date => date_field(params[Facet::DATE]),
    :section_page => params[Facet::SECTION_PAGE],
    :source => text_field(params[Facet::SOURCE]),

    # FIXME! MORE FACET PARAMS
    # Facets that hold a list of values, each wrapped in a Facet
    :classifiers => facet_params(params, Facet::CLASSIFIERS),
    :descriptions => facet_params(params, Facet::DESCRIPTION),
    :geo => facet_params(params, Facet::GEO),
    :material_types => facet_params(params, Facet::MATERIAL_TYPE),
    :organizations => facet_params(params, Facet::ORGANIZATION),
    :persons => facet_params(params, Facet::PERSON),
    :nytd_bylines => facet_params(params, Facet::NYTD_BYLINE),
    :nytd_descriptions => facet_params(params, Facet::NYTD_DESCRIPTION),
    :nytd_geo => facet_params(params, Facet::NYTD_GEO),
    :nytd_organizations => facet_params(params, Facet::NYTD_ORGANIZATION),
    :nytd_persons => facet_params(params, Facet::NYTD_PERSON),
    :nytd_sections => facet_params(params, Facet::NYTD_SECTION),
    :nytd_works_mentioned => facet_params(params, Facet::NYTD_WORKS_MENTIONED),
    :works_mentioned => facet_params(params, Facet::WORKS_MENTIONED)
  }

  Article.new(attributes)
end
102
+
103
+ ##
104
+ # Executes a search against the Article Search API and returns a ResultSet of 10 articles. At its simplest form, can be invoked
105
+ # with just a string like so
106
+ #
107
+ # Article.search 'dog food'
108
+ #
109
+ # which will do a text search against several text fields in the article and return the most basic fields for each
110
+ # article, but it takes a large number of potential parameters. All of these fields and then some can be returned as display fields
111
+ # in the articles retrieved from search (see the <tt>:fields</tt> argument below)
112
+ #
113
+ # == TEXT FIELDS
114
+ #
115
+ # If passed a string as the first argument, the text will be used to search against the title, byline and body fields of articles. This text takes
116
+ # the following boolean syntax:
117
+ # * <tt>dog food</tt> - similar to doing a boolean AND search on both terms
118
+ # * <tt>"ice cream"</tt> - matches the words as a phrase in the text
119
+ # * <tt>ice -cream</tt> - to search text that doesn't contain a term, prefix with the minus sign.
120
+ #
121
+ # Should you wish to target text against specific text fields associated with the article, the following named parameters are supported:
122
+ # * <tt>:abstract</tt> - A summary of the article, written by Times indexers
123
+ # * <tt>:body</tt> - A portion of the beginning of the article. Note: Only a portion of the article body is included in responses. But when you search against the body field, you search the full text of the article.
124
+ # * <tt>:byline</tt> - The article byline, including the author's name
125
+ # * <tt>:lead_paragraph</tt> - The first paragraph of the article (as it appeared in the printed newspaper)
126
+ # * <tt>:nytd_byline</tt> - The article byline, formatted for NYTimes.com
127
+ # * <tt>:nytd_lead_paragraph</tt> - The first paragraph of the article (as it appears on NYTimes.com)
128
+ # * <tt>:nytd_title</tt> - The article title on NYTimes.com (this field may or may not match the title field; headlines may be shortened and edited for the Web)
129
+ # * <tt>:text</tt> - The text field consists of title + byline + body (combined in an OR search) and is the default field for keyword searches.
130
+ # * <tt>:title</tt> - The article title (headline); corresponds to the headline that appeared in the printed newspaper
131
+ # * <tt>:url</tt> - The URL of the article on NYTimes.com
132
+ #
133
+ # == FACET SEARCHING
134
+ #
135
+ # Beyond query searches, the NY Times API also allows you to search against controlled vocabulary metadata associated with the article. This is powerful, if you want precise matching against specific
136
+ # people, places, etc (eg, "I want stories about Ford the former president, not Ford the automotive company"). The following Facet constants are supported.
137
+ #
138
+ # * <tt>Facet::CLASSIFIERS</tt> - Taxonomic classifiers that reflect Times content categories, such as _Top/News/Sports_
139
+ # * <tt>Facet::COLUMN</tt> - A Times column title (if applicable), such as _Weddings_ or _Ideas & Trends_
140
+ # * <tt>Facet::DATE</tt> - The publication date in YYYYMMDD format
141
+ # * <tt>Facet::DAY_OF_WEEK</tt> - The day of the week (e.g., Monday, Tuesday) the article was published (compare <tt>PUB_DAY</tt>, which is the numeric date rather than the day of the week)
142
+ # * <tt>Facet::DESCRIPTION</tt> - Descriptive subject terms assigned by Times indexers (must be in UPPERCASE)
143
+ # * <tt>Facet::DESK</tt> - The Times desk that produced the story (e.g., _Business/Financial Desk_)
144
+ # * <tt>Facet::GEO</tt> - Standardized names of geographic locations, assigned by Times indexers (must be in UPPERCASE)
145
+ # * <tt>Facet::MATERIAL_TYPE</tt> - The general article type, such as Biography, Editorial or Review
146
+ # * <tt>Facet::ORGANIZATION</tt> - Standardized names of organizations, assigned by Times indexers (must be UPPERCASE)
147
+ # * <tt>Facet::PAGE</tt> - The page the article appeared on (in the printed paper)
148
+ # * <tt>Facet::PERSON</tt> - Standardized names of people, assigned by Times indexers. When used in a request, values must be UPPERCASE.
149
+ # * <tt>Facet::PUB_DAY</tt> - The day (DD) segment of date, separated for use as facets
150
+ # * <tt>Facet::PUB_MONTH</tt> - The month (MM) segment of date, separated for use as facets
151
+ # * <tt>Facet::PUB_YEAR</tt> - The year (YYYY) segment of date, separated for use as facets
152
+ # * <tt>Facet::SECTION_PAGE</tt> - The full page number of the printed article (e.g., _D00002_)
153
+ # * <tt>Facet::SOURCE</tt> - The originating body (e.g., _AP_, _Dow Jones_, _The New York Times_)
154
+ # * <tt>Facet::WORKS_MENTIONED</tt> - Literary works mentioned in the article
155
+ # * <tt>Facet::NYTD_BYLINE</tt> - The article byline, formatted for NYTimes.com
156
+ # * <tt>Facet::NYTD_DESCRIPTION</tt> - Descriptive subject terms, assigned for use on NYTimes.com (to get standardized terms, use the TimesTags API). When used in a request, values must be Mixed Case
157
+ # * <tt>Facet::NYTD_GEO</tt> - Standardized names of geographic locations, assigned for use on NYTimes.com (to get standardized terms, use the TimesTags API). When used in a request, values must be Mixed Case
158
+ # * <tt>Facet::NYTD_ORGANIZATION</tt> - Standardized names of organizations, assigned for use on NYTimes.com (to get standardized terms, use the TimesTags API). When used in a request, values must be Mixed Case
159
+ # * <tt>Facet::NYTD_PERSON</tt> - Standardized names of people, assigned for use on NYTimes.com (to get standardized terms, use the TimesTags API). When used in a request, values must be Mixed Case.
160
+ # * <tt>Facet::NYTD_SECTION</tt> - The section the article appears in (on NYTimes.com)
161
+ # * <tt>Facet::NYTD_WORKS_MENTIONED</tt> - Literary works mentioned (titles formatted for use on NYTimes.com)
162
+ #
163
+ # Note that for your convenience you can also search with symbol versions of the constants (<tt>:geo => ['MANHATTAN']</tt>). Even pluralization is supported. To get the string API version of the facet use <tt>Facet.symbol_name</tt>.
164
+ #
165
+ # The following two search fields are used for facet searching:
166
+ # * <tt>:only_facets</tt> - takes a single value or a collection of facets to search. Facets can either be specified as a Hash of facet name to value or values (like <tt>{Facet::GEO => 'CALIFORNIA'}</tt>), or Facet instances returned from a previous search can be passed directly (singly or in an Array). A single string can be passed as well if you have a hand-crafted string.
167
+ # * <tt>:except_facets</tt> - similar to <tt>:only_facets</tt> but is used to specify a list of facets to exclude.
168
+ #
169
+ # == TIME SEARCHES
170
+ # * <tt>:begin_date</tt>, <tt>:end_date</tt> - the parameters are used to specify a start and end date for search results. BOTH of these must be provided or the API will return an error. Accepts either a Time/Date argument or a string of the format YYYYMMDD. For convenience the following alternative methods are provided
171
+ # * <tt>:before</tt> - an alternative to :end_date. Automatically adds a :begin_date of 19810101 (<tt>EARLIEST_BEGIN_DATE</tt>) if no :since or :begin_date argument is also provided.
172
+ # * <tt>:since</tt> - An alternative to :begin_date. Automatically adds an :end_date of tomorrow's date (<tt>Date.today + 1</tt>) if no :before or :end_date argument is provided.
173
+ #
174
+ # == OTHER SEARCH FIELDS
175
+ # * <tt>:fee</tt> - if set to true, only returns articles that must be purchased. If false, returns only free articles. If not specified, returns all articles
176
+ # * <tt>:has_thumbnail</tt> - returns only articles that have thumbnail images associated. Note that to see the thumbnails, you must specify either <tt>:thumbnail</tt> or <tt>:all</tt> in the <tt>:fields</tt> argument).
177
+ # * <tt>:has_multimedia</tt> - to be implemented
178
+ #
179
+ # == FACET SUMMARIES
180
+ #
181
+ # The <tt>:facets</tt> argument can be used to specify up to 5 facet fields to be returned alongside the search that provide overall counts
182
+ # of how much each facet term appears in the search results. FIXME provide list of available facets as well as description of :nytd parameter.
183
+ #
184
+ # == ARTICLE FIELDS
185
+ #
186
+ # The <tt>:fields</tt> parameter is used to indicate what fields are returned with each article from the search results. If not specified, only
187
+ # the following fields are returned for each article: body, byline, date, title, and url. To return specific fields, any of the search fields
188
+ # from above can be explicitly specified in a comma-delimited list, as well as the additional display-only (not searchable) fields below (these
189
+ # are strings or symbols):
190
+ #
191
+ # * <tt>:all</tt> - return all fields for the article
192
+ # * <tt>:none</tt> - display only the facet breakdown and no article results
193
+ # * <tt>:multimedia</tt> - return any related multimedia links for the article
194
+ # * <tt>:thumbnail</tt> - return information for a related thumbnail image (if the article has one)
195
+ # * <tt>:word_count</tt> - the word_count of the article.
196
def self.search(query, params={})
  # Work on a copy so the caller's hash is never modified.
  options = params.dup

  # A String is the free-text query; a Hash is merged in as further named options.
  if query.is_a?(String)
    options[:query] = query
  elsif query.is_a?(Hash)
    options.merge!(query)
  end

  api_params = {}

  # Each builder translates one group of options into API request parameters. The order matters:
  # the facet and boolean builders append to the 'query' value produced by add_query_params.
  [
    :add_query_params,
    :add_facet_conditions_params,
    :add_boolean_params,
    :add_facets_param,
    :add_fields_param,
    :add_rank_params,
    :add_date_params,
    :add_offset_params
  ].each { |builder| send(builder, api_params, options) }

  parse_reply(invoke(api_params))
end
220
+
221
+ private
222
# Normalizes a date option for the API: Strings pass through untouched, anything responding to
# +strftime+ is formatted as YYYYMMDD, and every other value raises ArgumentError naming +field_name+.
def self.date_argument(field_name, arg)
  if arg.is_a?(String)
    arg
  elsif arg.respond_to?(:strftime)
    arg.strftime("%Y%m%d")
  else
    raise ArgumentError, "Only a string or Date/Time object is allowed as a parameter to the #{field_name} input"
  end
end
227
+
228
# Wraps each value stored under +facet_name+ in +params+ in a Facet (built with a nil third argument).
# Returns nil when +params+ has no value for that facet.
def self.facet_params(params, facet_name)
  terms = params[facet_name]
  terms.nil? ? nil : terms.map { |term| Facet.new(facet_name, term, nil) }
end
233
+
234
# Turns a free-text search string into a field-scoped query for +field+.
#
# +argument+ is split into terms: a double-quoted phrase counts as one term, as does any run of
# non-whitespace characters. Each term becomes <tt>field:term</tt>; a term with a leading minus
# becomes <tt>-field:term</tt>. A minus directly in front of a quoted phrase stays attached to the
# whole phrase, so <tt>-"ice cream"</tt> yields <tt>-field:"ice cream"</tt> instead of being split
# at the space. The terms are joined with single spaces; +argument+ itself is not modified.
def self.text_argument(field, argument)
  terms = argument.scan(/-?"[^"]+"|\S+/)

  terms.map do |term|
    if term[0, 1] == '-'
      "-#{field}:#{term[1..-1]}"
    else
      "#{field}:#{term}"
    end
  end.join(' ')
end
247
+
248
+
249
# Hands the raw API reply to ResultSet.init_from_api and returns whatever it builds.
def self.parse_reply(reply)
  ResultSet.init_from_api(reply)
end
252
+
253
# Sets out_params['facets'] from the :facets option: a single facet or an Array of facets, each
# converted with Facet.symbol_name and joined with commas. Does nothing when :facets is not given.
def self.add_facets_param(out_params, in_params)
  requested = in_params[:facets]
  return unless requested

  facet_list = requested.is_a?(Array) ? requested : [requested]
  out_params['facets'] = facet_list.map { |facet| Facet.symbol_name(facet) }.join(',')
end
264
+
265
# Maps one requested field name (String or Symbol) to its API spelling. 'thumbnail' expands to the
# comma-separated IMAGE_FIELDS; every other name is passed through as a String.
def self.field_param(name)
  field = name.to_s
  field == 'thumbnail' ? IMAGE_FIELDS.join(',') : field
end
273
+
274
# Sets out_params['fields'] from the :fields option.
#
# * nil              - nothing is set
# * :all / 'all'     - every name in ALL_FIELDS
# * :none / 'none'   - a single space (no article fields); when no 'facets' value has been set yet,
#                      Facet::DEFAULT_RETURN_FACETS is requested
# * String / Symbol  - that one field, via field_param
# * Array            - each element via field_param, comma-joined
#
# Raises ArgumentError for any other kind of value.
def self.add_fields_param(out_params, in_params)
  fields = in_params[:fields]

  # The special values are documented as strings or symbols; without this, 'all' and 'none'
  # given as strings would be sent to the API as literal field names.
  fields = fields.to_sym if fields == 'all' || fields == 'none'

  case fields
  when nil
    # do nothing
  when :all
    out_params['fields'] = ALL_FIELDS.join(',')
  when :none
    out_params['fields'] = ' '
    unless out_params['facets']
      out_params['facets'] = Facet::DEFAULT_RETURN_FACETS.join(',')
    end
  when String, Symbol
    out_params['fields'] = field_param(fields)
  when Array
    out_params['fields'] = fields.map { |f| field_param(f) }.join(',')
  else
    raise ArgumentError, "Fields must either be :all, a single field name, or an array of field names (either strings or symbols)"
  end
end
293
+
294
# Builds out_params['query'] from the free-text :query option plus one text_argument clause for
# every TEXT_FIELDS name supplied as an option. The value is nil when nothing was supplied.
def self.add_query_params(out_params, in_params)
  clauses = [in_params[:query]]

  # Field-targeted text options are appended after the general query text.
  TEXT_FIELDS.each do |field|
    text = in_params[field.to_sym]
    clauses << text_argument(field, text) if text
  end

  combined = clauses.compact.join(' ')
  out_params['query'] = combined
  out_params['query'] = nil if combined.empty?
end
309
+
310
# Formats one facet condition as <tt>name:[value]</tt>, prefixed with a minus when +exclude+ is set.
# A Symbol +name+ is first converted with Facet.symbol_name.
def self.facet_argument(name, value, exclude = false)
  facet_name = name.is_a?(Symbol) ? Facet.symbol_name(name) : name
  prefix = exclude ? '-' : ''

  "#{prefix}#{facet_name}:[#{value}]"
end
317
+
318
# Converts a facet option into an Array of query clauses.
#
# * nil    - an empty Array
# * String - returned as the only clause, unchanged (+exclude+ is not applied to it)
# * Facet  - one clause built from its facet_type and term
# * Array  - must hold only Facet instances (ArgumentError otherwise); terms are grouped by
#            facet_type, in order of first appearance
# * Hash   - facet name => a single term or an Array of terms
#
# Any other kind of value yields an empty Array.
def self.parse_facet_params(facets, exclude = false)
  return [] if facets.nil?
  return [facets] if facets.is_a?(String)
  return [facet_argument(facets.facet_type, facets.term, exclude)] if facets.is_a?(Facet)

  if facets.is_a?(Array)
    unless facets.all? { |facet| facet.is_a?(Facet) }
      raise ArgumentError, "Only Facet instances can be passed in as an array; use Hash for Facet::Name => values input"
    end

    terms_by_facet = {}
    facets.each { |facet| (terms_by_facet[facet.facet_type] ||= []) << facet.term }
  elsif facets.is_a?(Hash)
    terms_by_facet = facets
  else
    return []
  end

  # Both collection forms are now a name => term(s) mapping; emit one clause per term.
  clauses = []
  terms_by_facet.each_pair do |facet_name, terms|
    term_list = terms.is_a?(Array) ? terms : [terms]
    term_list.each { |term| clauses << facet_argument(facet_name, term, exclude) }
  end
  clauses
end
361
+
362
# Appends the :only_facets conditions and the negated :except_facets conditions to
# out_params['query']. The query is left untouched when neither option produces a clause.
def self.add_facet_conditions_params(out_params, in_params)
  included = parse_facet_params(in_params[:only_facets])
  excluded = parse_facet_params(in_params[:except_facets], true)
  return if included.empty? && excluded.empty?

  out_params['query'] = ([out_params['query']] + included + excluded).compact.join(' ')
end
372
+
373
# Appends yes/no conditions to out_params['query'] for the :fee, :has_multimedia and :has_thumbnail
# options. A truthy option adds <tt>field:Y</tt>, a false one adds <tt>-field:Y</tt>, and an option
# that is nil (not given) adds nothing.
def self.add_boolean_params(out_params, in_params)
  # option name => API field it tests
  flags = [
    [:fee, 'fee'],
    [:has_multimedia, 'related_multimedia'],
    [:has_thumbnail, 'small_image']
  ]

  clauses = []
  flags.each do |option, api_field|
    setting = in_params[option]
    next if setting.nil?

    clauses << "#{'-' unless setting}#{api_field}:Y"
  end

  out_params['query'] = ([out_params['query']] + clauses).compact.join(' ') unless clauses.empty?
end
393
+
394
# Copies the :rank option into out_params['rank'] as a String. Only :newest, :oldest and :closest
# are accepted; any other truthy value raises ArgumentError. Does nothing when :rank is not given.
def self.add_rank_params(out_params, in_params)
  rank = in_params[:rank]
  return unless rank

  allowed = [:newest, :oldest, :closest]
  raise ArgumentError, "Rank should only be :newest | :oldest | :closest" unless allowed.include?(rank)

  out_params['rank'] = rank.to_s
end
403
+
404
# Fills out_params['begin_date'] and out_params['end_date'] from the date options.
#
# :begin_date / :end_date are converted with date_argument. :since and :before are alternatives
# to them; giving :since with :begin_date, or :before with :end_date, raises ArgumentError.
# When only an upper bound came from :before, the begin date defaults to EARLIEST_BEGIN_DATE.
# When only a lower bound came from :since, the end date defaults to tomorrow (Date.today + 1).
def self.add_date_params(out_params, in_params)
  begin_date = in_params[:begin_date]
  end_date = in_params[:end_date]
  since = in_params[:since]
  before = in_params[:before]

  out_params['begin_date'] = date_argument(:begin_date, begin_date) if begin_date
  out_params['end_date'] = date_argument(:end_date, end_date) if end_date

  if since
    raise ArgumentError, "You can't specify both :begin_date and :since as arguments" if begin_date

    out_params['begin_date'] = date_argument(:since, since)
  end

  if before
    raise ArgumentError, "You can't specify both :end_date and :before as arguments" if end_date

    out_params['end_date'] = date_argument(:before, before)
  end

  # Supply the missing half of the range when only one alternative bound was given.
  out_params['begin_date'] = EARLIEST_BEGIN_DATE if before && out_params['begin_date'].nil?

  if since && out_params['end_date'].nil?
    out_params['end_date'] = date_argument(:end_date, Date.today + 1)
  end
end
437
+
438
# Sets out_params['offset'] from the :page or :offset option. :page counts from 1 and is stored
# as page - 1; :offset is stored as given and wins when both are supplied. Raises ArgumentError
# for a non-Integer value or a page below 1.
def self.add_offset_params(out_params, in_params)
  page = in_params[:page]
  if page
    raise ArgumentError, "Page must be an integer" unless page.is_a?(Integer)
    raise ArgumentError, "Page must count up from 1" if page < 1

    # Page counts from 1, offset counts from 0
    out_params['offset'] = page - 1
  end

  offset = in_params[:offset]
  if offset
    raise ArgumentError, "Offset must be an integer" unless offset.is_a?(Integer)

    out_params['offset'] = offset
  end
end
460
+ end
461
+ end
462
+ end