harrisj-nytimes-articles 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION.yml ADDED
@@ -0,0 +1,4 @@
1
+ ---
2
+ :minor: 1
3
+ :patch: 1
4
+ :major: 0
@@ -0,0 +1,8 @@
1
+ # should I be setting this?
2
+ $KCODE = 'UTF8'
3
+
4
+ require File.join(File.dirname(__FILE__), 'nytimes_articles', 'exceptions')
5
+ require File.join(File.dirname(__FILE__), 'nytimes_articles', 'base')
6
+ require File.join(File.dirname(__FILE__), 'nytimes_articles', 'facet')
7
+ require File.join(File.dirname(__FILE__), 'nytimes_articles', 'article')
8
+ require File.join(File.dirname(__FILE__), 'nytimes_articles', 'result_set')
@@ -0,0 +1,397 @@
1
+ require 'rubygems'
2
+
3
+ module Nytimes
4
+ module Articles
5
+ class Article < Base
6
+ RAW_FIELDS = %w(url)
7
+ TEXT_FIELDS = %w(abstract author body byline lead_paragraph nytd_lead_paragraph nytd_title title)
8
+ NUMERIC_FIELDS = %w(word_count)
9
+ BOOLEAN_FIELDS = %w(fee small_image)
10
+ IMAGE_FIELDS = %w(small_image small_image_url small_image_height small_image_width)
11
+ MULTIMEDIA_FIELDS = %w(multimedia related_multimedia)
12
+
13
+ ALL_FIELDS = TEXT_FIELDS + RAW_FIELDS + NUMERIC_FIELDS + BOOLEAN_FIELDS + IMAGE_FIELDS + MULTIMEDIA_FIELDS + Facet::ALL_FACETS
14
+
15
+ attr_reader *ALL_FIELDS
16
+
17
+ # Scalar facets
18
+ attr_reader :page, :column, :pub_month, :pub_year, :pub_day, :day_of_week, :desk, :date, :section_page, :source
19
+
20
+ # Facets that return multiple values
21
+ attr_reader :classifiers, :descriptions, :geo, :material_types, :organizations, :persons, :nytd_bylines, :nytd_descriptions, :nytd_geo, :nytd_organizations, :nytd_persons, :nytd_sections, :nytd_works_mentioned, :works_mentioned
22
+ alias :people :persons
23
+ alias :nytd_people :nytd_persons
24
+
25
##
# Builds an Article from a hash of attributes: every key/value pair in
# +params+ is stored in the instance variable named after the key.
# Article instances are normally created for you from API results.
def initialize(params={})
  params.each_pair { |name, value| instance_variable_set("@#{name}", value) }
end
32
+
33
+ ##
34
+ # Is this article available for a fee?
35
+ alias :fee? :fee
36
+
37
##
# Is this article available for free? Simply the negation of fee?.
def free?
  !fee?
end
42
+
43
##
# Creates a new Article from a hash returned from the API (string keys).
# This is called on search results; you have no reason to call it.
#
# Text values are passed through text_field, numeric values through
# integer_field, the date through date_field and multi-valued facets through
# facet_params. Fields absent from the reply end up as nil (+fee+ as false).
def self.init_from_api(params)
  text    = lambda { |key| text_field(params[key]) }
  integer = lambda { |key| integer_field(params[key]) }
  facets  = lambda { |facet| facet_params(params, facet) }

  attributes = {
    # Plain article fields
    :abstract            => text.call('abstract'),
    :author              => text.call('author'),
    :body                => text.call('body'),
    :byline              => text.call('byline'),
    :fee                 => params['fee'] || false,
    :lead_paragraph      => text.call('lead_paragraph'),
    :nytd_title          => text.call('nytd_title'),
    :nytd_lead_paragraph => text.call('nytd_lead_paragraph'),
    :related_multimedia  => nil, # FIXME
    :image               => nil, # FIXME
    :title               => text.call('title'),
    :url                 => params['url'],
    :word_count          => integer.call('word_count'),

    # Facets that return scalars
    :page         => integer.call(Facet::PAGE),
    :column       => text.call(Facet::COLUMN),
    :pub_month    => integer.call(Facet::PUB_MONTH),
    :pub_year     => integer.call(Facet::PUB_YEAR),
    :pub_day      => integer.call(Facet::PUB_DAY),
    :day_of_week  => params[Facet::DAY_OF_WEEK],
    :desk         => text.call(Facet::DESK),
    :date         => date_field(params[Facet::DATE]),
    :section_page => params[Facet::SECTION_PAGE],
    :source       => text.call(Facet::SOURCE),

    # FIXME! MORE FACET PARAMS
    # Facets that return arrays
    :classifiers          => facets.call(Facet::CLASSIFIERS),
    :descriptions         => facets.call(Facet::DESCRIPTION),
    :geo                  => facets.call(Facet::GEO),
    :material_types       => facets.call(Facet::MATERIAL_TYPE),
    :organizations        => facets.call(Facet::ORGANIZATION),
    :persons              => facets.call(Facet::PERSON),
    :nytd_bylines         => facets.call(Facet::NYTD_BYLINE),
    :nytd_descriptions    => facets.call(Facet::NYTD_DESCRIPTION),
    :nytd_geo             => facets.call(Facet::NYTD_GEO),
    :nytd_organizations   => facets.call(Facet::NYTD_ORGANIZATION),
    :nytd_persons         => facets.call(Facet::NYTD_PERSON),
    :nytd_sections        => facets.call(Facet::NYTD_SECTION),
    :nytd_works_mentioned => facets.call(Facet::NYTD_WORKS_MENTIONED),
    :works_mentioned      => facets.call(Facet::WORKS_MENTIONED)
  }

  Article.new(attributes)
end
93
+
94
+ ##
95
+ # Executes a search against the Article Search API and returns a ResultSet of 10 articles. At its simplest form, can be invoked
96
+ # with just a string like so
97
+ #
98
+ # Article.search 'dog food'
99
+ #
100
+ # which will do a text search against several text fields in the article and return the most basic fields for each
101
+ # article, but it takes a large number of potential parameters. All of these fields and then some can be returned as display fields
102
+ # in the articles retrieved from search (see the <tt>:fields</tt> argument below)
103
+ #
104
+ # == TEXT FIELDS
105
+ #
106
+ # If passed a string as the first argument, the text will be used to search against the title, byline and body fields of articles. This text takes
107
+ # the following boolean syntax:
108
+ # * <tt>dog food</tt> - similar to doing a boolean AND search on both terms
109
+ # * <tt>"ice cream"</tt> - matches the words as a phrase in the text
110
+ # * <tt>ice -cream</tt> - to search text that doesn't contain a term, prefix with the minus sign.
111
+ #
112
+ # Should you wish to target text against specific text fields associated with the article, the following named parameters are supported:
113
+ # * <tt>:abstract</tt> - A summary of the article, written by Times indexers
114
+ # * <tt>:body</tt> - A portion of the beginning of the article. Note: Only a portion of the article body is included in responses. But when you search against the body field, you search the full text of the article.
115
+ # * <tt>:byline</tt> - The article byline, including the author's name
116
+ # * <tt>:lead_paragraph</tt> - The first paragraph of the article (as it appeared in the printed newspaper)
117
+ # * <tt>:nytd_byline</tt> - The article byline, formatted for NYTimes.com
118
+ # * <tt>:nytd_lead_paragraph</tt> - The first paragraph of the article (as it appears on NYTimes.com)
119
+ # * <tt>:nytd_title</tt> - The article title on NYTimes.com (this field may or may not match the title field; headlines may be shortened and edited for the Web)
120
+ # * <tt>:text</tt> - The text field consists of title + byline + body (combined in an OR search) and is the default field for keyword searches.
121
+ # * <tt>:title</tt> - The article title (headline); corresponds to the headline that appeared in the printed newspaper
122
+ # * <tt>:url</tt> - The URL of the article on NYTimes.com
123
+ #
124
+ # == FACET SEARCHING
125
+ #
126
+ # Beyond query searches, the NY Times API also allows you to search against controlled vocabulary metadata associated with the article. This is powerful, if you want precise matching against specific
127
+ # people, places, etc (eg, "I want stories about Ford the former president, not Ford the automotive company"). The following Facet constants are supported.
128
+ #
129
+ # * <tt>Facet::CLASSIFIERS</tt> - Taxonomic classifiers that reflect Times content categories, such as _Top/News/Sports_
130
+ # * <tt>Facet::COLUMN</tt> - A Times column title (if applicable), such as _Weddings_ or _Ideas & Trends_
131
+ # * <tt>Facet::DATE</tt> - The publication date in YYYYMMDD format
132
+ # * <tt>Facet::DAY_OF_WEEK</tt> - The day of the week (e.g., Monday, Tuesday) the article was published (compare <tt>PUB_DAY</tt>, which is the numeric date rather than the day of the week)
133
+ # * <tt>Facet::DESCRIPTION</tt> - Descriptive subject terms assigned by Times indexers (must be in UPPERCASE)
134
+ # * <tt>Facet::DESK</tt> - The Times desk that produced the story (e.g., _Business/Financial Desk_)
135
+ # * <tt>Facet::GEO</tt> - Standardized names of geographic locations, assigned by Times indexers (must be in UPPERCASE)
136
+ # * <tt>Facet::MATERIAL_TYPE</tt> - The general article type, such as Biography, Editorial or Review
137
+ # * <tt>Facet::ORGANIZATION</tt> - Standardized names of organizations, assigned by Times indexers (must be UPPERCASE)
138
+ # * <tt>Facet::PAGE</tt> - The page the article appeared on (in the printed paper)
139
+ # * <tt>Facet::PERSON</tt> - Standardized names of people, assigned by Times indexers. When used in a request, values must be UPPERCASE.
140
+ # * <tt>Facet::PUB_DAY</tt> - The day (DD) segment of date, separated for use as facets
141
+ # * <tt>Facet::PUB_MONTH</tt> - The month (MM) segment of date, separated for use as facets
142
+ # * <tt>Facet::PUB_YEAR</tt> - The year (YYYY) segment of date, separated for use as facets
143
+ # * <tt>Facet::SECTION_PAGE</tt> - The full page number of the printed article (e.g., _D00002_)
144
+ # * <tt>Facet::SOURCE</tt> - The originating body (e.g., _AP_, _Dow Jones_, _The New York Times_)
145
+ # * <tt>Facet::WORKS_MENTIONED</tt> - Literary works mentioned in the article
146
+ # * <tt>Facet::NYTD_BYLINE</tt> - The article byline, formatted for NYTimes.com
147
+ # * <tt>Facet::NYTD_DESCRIPTION</tt> - Descriptive subject terms, assigned for use on NYTimes.com (to get standardized terms, use the TimesTags API). When used in a request, values must be Mixed Case
148
+ # * <tt>Facet::NYTD_GEO</tt> - Standardized names of geographic locations, assigned for use on NYTimes.com (to get standardized terms, use the TimesTags API). When used in a request, values must be Mixed Case
149
+ # * <tt>Facet::NYTD_ORGANIZATION</tt> - Standardized names of organizations, assigned for use on NYTimes.com (to get standardized terms, use the TimesTags API). When used in a request, values must be Mixed Case
150
+ # * <tt>Facet::NYTD_PERSON</tt> - Standardized names of people, assigned for use on NYTimes.com (to get standardized terms, use the TimesTags API). When used in a request, values must be Mixed Case.
151
+ # * <tt>Facet::NYTD_SECTION</tt> - The section the article appears in (on NYTimes.com)
152
+ # * <tt>Facet::NYTD_WORKS_MENTIONED</tt> - Literary works mentioned (titles formatted for use on NYTimes.com)
153
+ #
154
+ # The following two search fields are used for facet searching:
155
+ # * <tt>:search_facets</tt> - takes a single value or a collection of facets to search. Facets can be specified as a Hash of facet name to value(s) (like <tt>{Facet::GEO => 'CALIFORNIA'}</tt>), or Facet instances returned from a previous search can be passed directly, either singly or in an array. A single string can be passed as well if you have a hand-crafted facet query.
156
+ # * <tt>:exclude_facets</tt> - similar to <tt>:search_facets</tt> but is used to specify a list of facets to exclude.
157
+ #
158
+ # == OTHER SEARCH FIELDS
159
+ # * <tt>:fee</tt> - if true, only match articles flagged as available for a fee; if false, exclude them. Leave unset to match both.
160
+ # * <tt>:begin_date</tt>, <tt>:end_date</tt> - the parameters are used to specify a start and end date for search results. BOTH of these must be provided or the API will return an error. Accepts either a Time/Date argument or a string of the format YYYYMMDD. For convenience the following alternative methods are provided
161
+ # * <tt>:before</tt> - an alternative to :end_date. Automatically adds a :begin_date of sometime in 1980 if no :since argument is also provided; to be implemented
162
+ # * <tt>:since</tt> - An alternative to :begin_date. Automatically adds an :end_date of Time.now if no :before argument is provided; to be implemented.
163
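+ # * <tt>:has_thumbnail</tt> - if true, only match articles that have a thumbnail image; if false, exclude them. Leave unset to match both.
164
+ # * <tt>:has_multimedia</tt> - if true, only match articles that have related multimedia; if false, exclude them. Leave unset to match both.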
165
+ #
166
+ # == FACET SUMMARIES
167
+ #
168
+ # The <tt>:facets</tt> argument can be used to specify up to 5 facet fields to be returned alongside the search that provide overall counts
169
+ # of how much each facet term appears in the search results. FIXME provide list of available facets as well as description of :nytd parameter.
170
+ #
171
+ # == ARTICLE FIELDS
172
+ #
173
+ # The <tt>:fields</tt> parameter is used to indicate what fields are returned with each article from the search results. If not specified, only
174
+ # the following fields are returned for each article: body, byline, date, title, and url. To return specific fields, any of the search fields
175
+ # from above can be explicitly specified in a comma-delimited list, as well as the additional display-only (not searchable) fields below (these
176
+ # are strings or symbols):
177
+ #
178
+ # * <tt>:all</tt> - return all fields for the article
179
+ # * <tt>:none</tt> - display only the facet breakdown and no article results
180
+ # * <tt>:multimedia</tt> - return any related multimedia links for the article
181
+ # * <tt>:thumbnail</tt> - return information for a related thumbnail image (if the article has one)
182
+ # * <tt>:word_count</tt> - the word_count of the article.
183
def self.search(query, params={})
  # Work on a copy so the caller's hash is never modified.
  options = params.dup

  # A String is the free-text query; a Hash is merged in as further options.
  # Any other kind of first argument is ignored.
  if query.is_a?(String)
    options[:query] = query
  elsif query.is_a?(Hash)
    options.merge!(query)
  end

  request = {}

  # The query string is built first; the facet and boolean steps append to it.
  add_query_params(request, options)
  add_search_facets_param(request, options)
  add_boolean_params(request, options)
  add_fields_param(request, options)
  add_facets_param(request, options)
  add_rank_params(request, options)
  add_date_params(request, options)
  add_offset_params(request, options)

  parse_reply(invoke(request))
end
207
+
208
+ private
209
# Converts a date option into the string sent to the API. Strings are passed
# through untouched; anything responding to strftime is formatted as YYYYMMDD.
# Raises ArgumentError for every other kind of value; +field_name+ is only
# used in that error message.
def self.date_argument(field_name, arg)
  if arg.is_a?(String)
    arg
  elsif arg.respond_to?(:strftime)
    arg.strftime("%Y%m%d")
  else
    raise ArgumentError, "Only a string or Date/Time object is allowed as a parameter to the #{field_name} input"
  end
end
214
+
215
# Wraps each value stored under +facet_name+ in +params+ in a Facet (built
# with a nil third argument). Returns nil when the reply has no such key.
def self.facet_params(params, facet_name)
  terms = params[facet_name]
  return nil if terms.nil?

  terms.map { |term| Facet.new(facet_name, term, nil) }
end
220
+
221
# Turns free text into a query restricted to a single field.
#
# The text is split into terms: a double-quoted phrase (optionally preceded by
# a minus sign) is one term, otherwise a term is a run of non-whitespace
# characters. Each term becomes "field:term"; a term starting with "-" becomes
# "-field:term" so that it is excluded. Terms are joined with single spaces.
#
#   text_argument('title', 'dog -"ice cream"')  # => 'title:dog -title:"ice cream"'
#
# The optional leading minus is part of the phrase pattern so that a negated
# phrase stays one term instead of being broken at the space inside the quotes.
def self.text_argument(field, argument)
  terms = argument.to_s.scan(/-?"[^"]+"|\S+/)

  clauses = terms.map do |term|
    if term[0, 1] == '-'
      "-#{field}:#{term[1..-1]}"
    else
      "#{field}:#{term}"
    end
  end

  clauses.join(' ')
end
234
+
235
+
236
# Hands the parsed API reply to ResultSet.init_from_api and returns its result.
def self.parse_reply(reply)
  ResultSet.init_from_api(reply)
end
239
+
240
# Copies the :facets option into the request as a comma-separated 'facets'
# value. Accepts either one facet name or an array of names; does nothing
# when :facets is not given.
def self.add_facets_param(out_params, in_params)
  facets = in_params[:facets]
  return unless facets

  # Kernel#Array wraps a lone value and leaves arrays alone; String#to_a is
  # not available on Ruby 1.9 and later.
  out_params['facets'] = Array(facets).join(',')
end
245
+
246
# Translates the :fields option into the request's 'fields' value.
# * not given - nothing is added
# * :all - every name in ALL_FIELDS, comma-separated
# * a String or Symbol - used as is
# * an Array - its elements, comma-separated
# Any other value raises ArgumentError.
def self.add_fields_param(out_params, in_params)
  fields = in_params[:fields]
  return if fields.nil?

  out_params['fields'] =
    if :all == fields
      ALL_FIELDS.join(',')
    elsif fields.is_a?(String) || fields.is_a?(Symbol)
      fields.to_s
    elsif fields.is_a?(Array)
      fields.map { |name| name.to_s }.join(',')
    else
      raise ArgumentError, "Fields must either be :all, a single field name, or an array of field names (either strings or symbols)"
    end
end
260
+
261
# Builds the request's 'query' value from the :query option followed by one
# field-restricted clause (via text_argument) for every TEXT_FIELDS name given
# as an option. 'query' is set to nil when nothing was supplied.
def self.add_query_params(out_params, in_params)
  clauses = [in_params[:query]]

  TEXT_FIELDS.each do |name|
    text = in_params[name.to_sym]
    clauses << text_argument(name, text) if text
  end

  joined = clauses.compact.join(' ')
  out_params['query'] = joined
  out_params['query'] = nil if joined.empty?
end
276
+
277
# Formats one facet clause as "name:[v1,v2]", prefixed with "-" when
# +exclude+ is truthy. +value+ may be a single term or an array of terms.
def self.facet_argument(name, value, exclude = false)
  terms = value.is_a?(Array) ? value : [value]
  prefix = exclude ? '-' : ''

  "#{prefix}#{name}:[#{terms.join(',')}]"
end
284
+
285
# Normalises the :search_facets / :exclude_facets option into an array of
# facet clauses.
# * nil - empty array
# * String - used verbatim as the only clause (+exclude+ is not applied)
# * Facet - one clause built from its facet_type and term
# * Array - must hold only Facet instances (ArgumentError otherwise); terms
#   are grouped per facet_type, one clause per type
# * Hash - one clause per facet name => value(s) pair
# Any other value yields an empty array.
def self.parse_facet_params(facets, exclude = false)
  return [] if facets.nil?
  return [facets] if facets.is_a?(String)
  return [facet_argument(facets.facet_type, facets.term, exclude)] if facets.is_a?(Facet)

  terms_by_type =
    if facets.is_a?(Array)
      unless facets.all? { |facet| facet.is_a? Facet }
        raise ArgumentError, "Only Facet instances can be passed in as an array; use Hash for Facet::Name => values input"
      end

      facets.inject({}) do |grouped, facet|
        (grouped[facet.facet_type] ||= []) << facet.term
        grouped
      end
    elsif facets.is_a?(Hash)
      facets
    else
      {}
    end

  terms_by_type.map { |type, terms| facet_argument(type, terms, exclude) }
end
320
+
321
# Appends the clauses for :search_facets and :exclude_facets (the latter
# negated) to the request's existing 'query', separated by spaces. The query
# is left untouched when neither option produces a clause.
def self.add_search_facets_param(out_params, in_params)
  included = parse_facet_params(in_params[:search_facets])
  excluded = parse_facet_params(in_params[:exclude_facets], true)
  return if included.empty? && excluded.empty?

  pieces = [out_params['query']] + included + excluded
  out_params['query'] = pieces.compact.join(' ')
end
331
+
332
# Appends yes/no filters to the request's 'query'. For each of :fee,
# :has_multimedia and :has_thumbnail that is given (not nil), a "<field>:Y"
# clause is added when the option is truthy and "-<field>:Y" when it is false.
def self.add_boolean_params(out_params, in_params)
  # option name => field name used in the query clause
  filters = [
    [:fee,            'fee'],
    [:has_multimedia, 'related_multimedia'],
    [:has_thumbnail,  'small_image']
  ]

  clauses = []
  filters.each do |option, api_field|
    wanted = in_params[option]
    next if wanted.nil?

    clauses << "#{'-' unless wanted}#{api_field}:Y"
  end
  return if clauses.empty?

  out_params['query'] = ([out_params['query']] + clauses).compact.join(' ')
end
352
+
353
# Copies the :rank option into the request's 'rank' value. The rank must be
# one of newest, oldest or closest and may be given as a Symbol or, like the
# :fields option, as a String. Raises ArgumentError for any other value; does
# nothing when :rank is not given.
def self.add_rank_params(out_params, in_params)
  rank = in_params[:rank]
  return unless rank

  # Normalise 'newest' to :newest so both spellings are validated the same way.
  rank = rank.to_sym if rank.is_a?(String) && !rank.empty?

  unless [:newest, :oldest, :closest].include?(rank)
    raise ArgumentError, "Rank should only be :newest | :oldest | :closest"
  end

  out_params['rank'] = rank.to_s
end
362
+
363
# Copies :begin_date and :end_date, when given, into the request after
# converting each with date_argument.
def self.add_date_params(out_params, in_params)
  starts_on, ends_on = in_params[:begin_date], in_params[:end_date]

  out_params['begin_date'] = date_argument(:begin_date, starts_on) if starts_on
  out_params['end_date'] = date_argument(:end_date, ends_on) if ends_on
end
372
+
373
# Sets the request's 'offset' from the :page or :offset option. :page counts
# from 1 and is converted to the zero-based offset; :offset is used as given
# and takes precedence when both are supplied. Raises ArgumentError when
# either value is not an Integer or when :page is below 1.
def self.add_offset_params(out_params, in_params)
  page = in_params[:page]
  if page
    raise ArgumentError, "Page must be an integer" unless page.is_a?(Integer)
    raise ArgumentError, "Page must count up from 1" if page < 1

    # Page counts from 1, offset counts from 0
    out_params['offset'] = page - 1
  end

  offset = in_params[:offset]
  if offset
    raise ArgumentError, "Offset must be an integer" unless offset.is_a?(Integer)

    out_params['offset'] = offset
  end
end
395
+ end
396
+ end
397
+ end
@@ -0,0 +1,110 @@
1
require 'cgi'
require 'date'
require 'json'
require 'open-uri'

require 'htmlentities'
4
+
5
+ module Nytimes
6
+ module Articles
7
+ class Base
8
+ API_SERVER = 'api.nytimes.com'
9
+ API_VERSION = 'v1'
10
+ API_NAME = 'article'
11
+ API_BASE = "/svc/search/#{API_VERSION}/#{API_NAME}"
12
+
13
+ @@api_key = nil
14
+ @@copyright = nil
15
+ @@debug = false
16
+
17
##
# The copyright footer to be placed at the bottom of any data from the
# New York Times. It is nil until an API call has completed, because the
# value is taken from the API reply.
def self.copyright
  @@copyright
end
22
+
23
##
# Sets the API key used for all requests. This must be called before any
# request is made against the API. To obtain an API key, go to
# http://developer.nytimes.com/
def self.api_key=(key)
  @@api_key = key
end
28
+
29
##
# Turns debug output on or off. While the flag is truthy, invoke prints the
# URI of every request before sending it.
def self.debug=(flag)
  @@debug = flag
end
32
+
33
##
# Returns the API key currently in use (nil until one has been set).
def self.api_key
  @@api_key
end
38
+
39
##
# Builds the request URI for the API server from a hash of query parameters.
#
# Keys and values are converted to strings and percent-encoded, so non-string
# values (such as an Integer offset) and reserved characters such as "&" or
# "=" inside a value are safe. Parameters whose value is nil are left out.
# Returns a URI::HTTP for API_SERVER and API_BASE.
def self.build_request_url(params)
  # CGI.escape encodes a space as "+"; emit "%20" instead, as before.
  # A literal "+" has already become "%2B" at that point.
  escape = lambda { |part| CGI.escape(part.to_s).gsub('+', '%20') }

  present = params.reject { |_, value| value.nil? }
  query = present.map { |key, value| "#{escape.call(key)}=#{escape.call(value)}" }.join('&')

  URI::HTTP.build :host => API_SERVER,
                  :path => API_BASE,
                  :query => query
end
46
+
47
# Decodes HTML entities in a text value from the API with HTMLEntities.
# nil is passed through as nil.
def self.text_field(value)
  value.nil? ? nil : HTMLEntities.new.decode(value)
end
52
+
53
# Converts a value from the API to an Integer with to_i.
# nil is passed through as nil.
def self.integer_field(value)
  value.nil? ? nil : value.to_i
end
57
+
58
# Parses a YYYYMMDD value from the API into a Date.
#
# Returns nil when the value is nil, is not exactly eight digits, or does not
# name a real calendar date (for example "20091345"). The value is converted
# with to_s first, so a numeric 20090201 is accepted as well.
def self.date_field(value)
  text = value.to_s
  # \A and \z anchor the whole string; ^ and $ would accept a matching line
  # inside a multi-line value.
  return nil unless text =~ /\A\d{8}\z/

  begin
    Date.strptime(text, "%Y%m%d")
  rescue ArgumentError
    # Eight digits that are not a valid date.
    nil
  end
end
62
+
63
# Sends one request to the API and returns the parsed JSON reply.
#
# The API key is added to +params+, the request URI is built with
# build_request_url and fetched, and the body is parsed as JSON. The reply's
# 'copyright' entry is remembered for Base.copyright.
#
# Returns nil when the server answers 404.
# Raises AuthenticationError when no API key is set or the server answers 403,
# BadRequestError on 400, ServerError on 500, ConnectionError on any other
# HTTP error, and BadResponseError when the body is empty or not valid JSON.
def self.invoke(params={})
  if @@api_key.nil?
    raise AuthenticationError, "You must initialize the API key before you run any API queries"
  end

  full_params = params.merge 'api-key' => @@api_key
  uri = build_request_url(full_params)

  puts "REQUEST: #{uri}" if @@debug

  reply = uri.read
  parsed_reply = JSON.parse reply

  if parsed_reply.nil?
    raise BadResponseError, "Empty reply returned from API"
  end

  #case parsed_reply['status']
  # FIXME
  #end

  @@copyright = parsed_reply['copyright']

  parsed_reply
rescue OpenURI::HTTPError => e
  # FIXME: Return message from body?
  # The error message starts with the HTTP status code.
  case e.message
  when /^400/
    raise BadRequestError
  when /^403/
    raise AuthenticationError
  when /^404/
    return nil
  when /^500/
    raise ServerError
  else
    raise ConnectionError
  end
rescue JSON::ParserError
  raise BadResponseError, "Invalid JSON returned from API:\n#{reply}"
end
108
+ end
109
+ end
110
+ end