calais 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
Binary file
@@ -1,3 +1,6 @@
1
+ == 0.0.6
2
+ * fully implemented 3.1 API
3
+
1
4
  == 0.0.5
2
5
 
3
6
  * fixed error where classes weren't being required in the proper order on Ubuntu (reported by Jon Moses)
@@ -6,12 +6,14 @@ Rakefile
6
6
  init.rb
7
7
  lib/calais.rb
8
8
  lib/calais/client.rb
9
- lib/calais/name.rb
10
- lib/calais/relationship.rb
11
9
  lib/calais/response.rb
10
+ spec/calais/client_spec.rb
11
+ spec/calais/response_spec.rb
12
12
  spec/calais_spec.rb
13
13
  spec/fixtures/.gitignore
14
- spec/fixtures/bicycles_austrailia.xml
14
+ spec/fixtures/bicycles_australia.response.json
15
+ spec/fixtures/bicycles_australia.response.rdf
16
+ spec/fixtures/bicycles_australia.xml
15
17
  spec/fixtures/calais.yml.sample
16
18
  spec/fixtures/slovenia_euro.xml
17
19
  spec/helper.rb
data/README.txt CHANGED
@@ -6,8 +6,8 @@ A Ruby interface to the Open Calais Web Service (http://opencalais.com)
6
6
  * Accepts documents in text/plain, text/xml and text/html format.
7
7
  * Basic access to the Open Calais API's Enlighten action.
8
8
  * Output is RDF representation of input document.
9
- * Single function ability to tag a document and receive a response in RDF format, names in the document, and their relationships.
10
-
9
+ * Single function ability to extract names, entities and geographies from given text.
10
+
11
11
  == Synopsis
12
12
 
13
13
  This is a very basic wrapper to the Open Calais API. It uses the POST endpoint and currently supports the Enlighten action. Here's a simple call:
@@ -17,16 +17,17 @@ This is a very basic wrapper to the Open Calais API. It uses the POST endpoint a
17
17
  This is the easiest way to get the RDF-formatted response from the OpenCalais service.
18
18
 
19
19
  If you want to do something more fun like getting all sorts of fun information about a document, you can try this:
20
-
20
+
21
21
  Calais.process_document(:content => "The government of the United Kingdom has given corporations like fast food chain McDonald's the right to award high school qualifications to employees who complete a company training program.", :content_type => :text, :license_id => LICENSE_ID)
22
22
 
23
- This will return an object containing the RDF representation of the text, the names in the text, and any relationships that exist there.
23
+ This will return an object containing information extracted from the RDF response.
24
24
 
25
25
  == Requirements
26
26
 
27
27
  * Ruby 1.8.5 or better
28
28
  * Uses the following standard libraries: digest/sha1, net/http, yaml, cgi
29
29
  * libxml2 / libxml-ruby
30
+ * curb, libcurl
30
31
 
31
32
  == Install
32
33
 
@@ -37,3 +38,6 @@ You can install the Calais gem via Rubygems (gem install calais) or by building
37
38
  Abhay Kumar
38
39
  http://opensynapse.net
39
40
 
41
+ == Acknowledgements
42
+
43
+ * Paul Legato (http://www.economaton.com/). Help all around with the new response processor and implementation of the latest API.
data/Rakefile CHANGED
@@ -19,7 +19,7 @@ end
19
19
 
20
20
  desc "Run all specs"
21
21
  Spec::Rake::SpecTask.new do |t|
22
- t.spec_files = FileList["spec/*_spec.rb"].sort
22
+ t.spec_files = FileList["spec/**/*_spec.rb"].sort
23
23
  t.spec_opts = ["--options", "spec/spec.opts"]
24
24
  end
25
25
 
@@ -2,55 +2,58 @@ require 'digest/sha1'
2
2
  require 'net/http'
3
3
  require 'cgi'
4
4
  require 'iconv'
5
+ require 'set'
5
6
 
6
7
  require 'rubygems'
7
8
  require 'xml/libxml'
9
+ require 'json'
10
+ require 'curb'
8
11
 
9
12
  $KCODE = "UTF8"
10
13
  require 'jcode'
11
14
 
12
15
  $:.unshift File.expand_path(File.dirname(__FILE__)) + '/calais'
13
16
 
14
- require 'name'
15
- require 'relationship'
16
- require 'response'
17
17
  require 'client'
18
+ require 'response'
18
19
 
19
20
  module Calais
20
- POST_URL = "http://api.opencalais.com"
21
-
22
- AVAILABLE_OUTPUT_FORMATS = {
23
- :rdf => "XML/RDF"
24
- }
25
- DEFAULT_OUTPUT_FORMAT = :rdf
26
-
21
+ REST_ENDPOINT = "http://api.opencalais.com/enlighten/rest/"
22
+ BETA_REST_ENDPOINT = "http://beta.opencalais.com/enlighten/rest/"
23
+
27
24
  AVAILABLE_CONTENT_TYPES = {
28
- :xml => "TEXT/XML",
29
- :html => "TEXT/HTML",
30
- :text => "TEXT/TXT"
25
+ :xml => 'text/xml',
26
+ :text => 'text/txt',
27
+ :html => 'text/html',
28
+ :raw => 'text/raw'
31
29
  }
32
- DEFAULT_CONTENT_TYPE = :xml
33
-
34
- DEFAULT_SUBMITTER = "calais.rb"
35
-
36
- AVAILABLE_METHODS = {
37
- :enlighten => "/enlighten/calais.asmx/Enlighten"
30
+
31
+ AVAILABLE_OUTPUT_FORMATS = {
32
+ :rdf => 'xml/rdf',
33
+ :simple => 'text/simple',
34
+ :microformats => 'text/microformats',
35
+ :json => 'application/json'
38
36
  }
39
-
37
+
38
+ KNOWN_ENABLES = ['GenericRelations']
39
+ KNOWN_DISCARDS = ['er/Company', 'er/Geo']
40
+
40
41
  MAX_RETRIES = 5
41
-
42
+ HTTP_TIMEOUT = 60
43
+ MIN_CONTENT_SIZE = 100
44
+ MAX_CONTENT_SIZE = 100_000
45
+
42
46
  class << self
43
- def enlighten(*args, &block) Client.new(*args, &block).call(:enlighten) end
47
+ def enlighten(*args, &block); Client.new(*args, &block).enlighten; end
44
48
 
45
- def process_document(*args, &block)
46
- data, error = Calais.enlighten(*args, &block)
47
- process_data(data, error)
49
+ def process_document(*args, &block)
50
+ client = Client.new(*args, &block)
51
+ client.output_format = :rdf
52
+ Response.new(client.enlighten)
48
53
  end
49
-
50
- def process_data(data, error=nil) Response.new(data, error) end
51
54
  end
52
55
  end
53
56
 
54
57
  module Calais
55
- VERSION = '0.0.5'
56
- end
58
+ VERSION = '0.0.6'
59
+ end
@@ -1,47 +1,110 @@
1
1
  module Calais
2
2
  class Client
3
- attr_accessor :license_id
3
+ # base attributes of the call
4
4
  attr_accessor :content
5
- attr_accessor :content_type, :output_format
6
- attr_accessor :allow_distribution, :allow_search, :submitter, :external_id
5
+ attr_accessor :license_id
6
+
7
+ # processing directives
8
+ attr_accessor :content_type, :output_format, :reltag_base_url, :calculate_relevance, :omit_outputting_original_text
9
+ attr_accessor :metadata_enables, :metadata_discards
10
+
11
+ # user directives
12
+ attr_accessor :allow_distribution, :allow_search, :external_id, :submitter
13
+
7
14
  attr_accessor :external_metadata
8
-
15
+
16
+ attr_accessor :use_beta
17
+
9
18
  def initialize(options={}, &block)
10
19
  options.each {|k,v| send("#{k}=", v)}
11
20
  yield(self) if block_given?
12
21
  end
13
-
14
- def call(method)
15
- method = method.intern unless method.is_a?(Symbol)
16
- raise ArgumentError.new("Unknown method: #{method}") unless AVAILABLE_METHODS.keys.include? method
17
-
22
+
23
+ def enlighten
18
24
  post_args = {
19
25
  "licenseID" => @license_id,
20
26
  "content" => Iconv.iconv('UTF-8//IGNORE', 'UTF-8', "#{@content} ").first[0..-2],
21
27
  "paramsXML" => params_xml
22
28
  }
23
-
24
- url = URI.parse(POST_URL + AVAILABLE_METHODS[method])
25
- resp, data = Net::HTTP.post_form(url, post_args)
26
-
27
- return resp.is_a?(Net::HTTPOK) ? data : [data, "API Error: #{resp}"]
29
+
30
+ @client ||= Curl::Easy.new
31
+ @client.url = @use_beta ? BETA_REST_ENDPOINT : REST_ENDPOINT
32
+ @client.timeout = HTTP_TIMEOUT
33
+
34
+ post_fields = post_args.map {|k,v| Curl::PostField.content(k, v) }
35
+
36
+ do_request(post_fields)
28
37
  end
29
-
38
+
39
+ def params_xml
40
+ check_params
41
+
42
+ params_node = XML::Node.new('c:params')
43
+ params_node['xmlns:c'] = 'http://s.opencalais.com/1/pred/'
44
+ params_node['xmlns:rdf'] = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
45
+
46
+ processing_node = XML::Node.new('c:processingDirectives')
47
+ processing_node['c:contentType'] = AVAILABLE_CONTENT_TYPES[@content_type] if @content_type
48
+ processing_node['c:outputFormat'] = AVAILABLE_OUTPUT_FORMATS[@output_format] if @output_format
49
+ processing_node['c:reltagBaseURL'] = @reltag_base_url.to_s if @reltag_base_url
50
+
51
+ processing_node['c:enableMetadataType'] = @metadata_enables.join(';') unless @metadata_enables.empty?
52
+ processing_node['c:discardMetadata'] = @metadata_discards.join(';') unless @metadata_discards.empty?
53
+ processing_node['c:omitOutputtingOriginalText'] = 'true' if @omit_outputting_original_text
54
+
55
+ user_node = XML::Node.new('c:userDirectives')
56
+ user_node['c:allowDistribution'] = @allow_distribution.to_s unless @allow_distribution.nil?
57
+ user_node['c:allowSearch'] = @allow_search.to_s unless @allow_search.nil?
58
+ user_node['c:externalID'] = @external_id.to_s if @external_id
59
+ user_node['c:submitter'] = @submitter.to_s if @submitter
60
+
61
+ params_node << processing_node
62
+ params_node << user_node
63
+
64
+ if @external_metadata
65
+ external_node = XML::Node.new('c:externalMetadata')
66
+ external_node << @external_metadata
67
+ params_node << external_node
68
+ end
69
+
70
+ params_node.to_s
71
+ end
72
+
30
73
  private
31
- def params_xml
32
- content_type = @content_type && AVAILABLE_CONTENT_TYPES.keys.include?(@content_type) ? AVAILABLE_CONTENT_TYPES[@content_type] : AVAILABLE_CONTENT_TYPES[DEFAULT_CONTENT_TYPE]
33
- output_format = @output_format && AVAILABLE_OUTPUT_FORMATS.keys.include?(@output_format) ? AVAILABLE_OUTPUT_FORMATS[@output_format] : AVAILABLE_OUTPUT_FORMATS[DEFAULT_OUTPUT_FORMAT]
34
- allow_distribution = @allow_distribution ? "true" : "false"
35
- allow_search = @allow_search ? "true" : "false"
36
- submitter = @submitter || DEFAULT_SUBMITTER
37
- external_id = @external_id || Digest::SHA1.hexdigest(@content.inspect)
38
- external_metadata = @external_metadata || ""
39
-
40
- xml = %[<c:params xmlns:c="http://s.opencalais.com/1/pred/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">]
41
- xml += %[<c:processingDirectives c:contentType="#{content_type}" c:outputFormat="#{output_format}"></c:processingDirectives>]
42
- xml += %[<c:userDirectives c:allowDistribution="#{allow_distribution}" c:allowSearch="#{allow_search}" c:externalID="#{external_id}" c:submitter="#{submitter}"></c:userDirectives>]
43
- xml += %[<c:externalMetadata>#{external_metadata}</c:externalMetadata>]
44
- xml += %[</c:params>]
74
+ def check_params
75
+ raise 'missing content' if @content.nil? || @content.empty?
76
+
77
+ content_length = @content.length
78
+ raise 'content is too small' if content_length < MIN_CONTENT_SIZE
79
+ raise 'content is too large' if content_length > MAX_CONTENT_SIZE
80
+
81
+ raise 'missing license id' if @license_id.nil? || @license_id.empty?
82
+
83
+ raise 'unknown content type' unless AVAILABLE_CONTENT_TYPES.keys.include?(@content_type) if @content_type
84
+ raise 'unknown output format' unless AVAILABLE_OUTPUT_FORMATS.keys.include?(@output_format) if @output_format
85
+
86
+ %w[calculate_relevance allow_distribution allow_search].each do |variable|
87
+ value = self.send(variable)
88
+ unless NilClass === value || TrueClass === value || FalseClass === value
89
+ raise "expected a boolean value for #{variable} but got #{value}"
90
+ end
91
+ end
92
+
93
+ @metadata_enables ||= []
94
+ unknown_enables = Set.new(@metadata_enables) - KNOWN_ENABLES
95
+ raise "unknown metadata enables: #{unknown_enables.to_a.inspect}" unless unknown_enables.empty? # convert the Set to an Array so the message lists the rejected values
96
+
97
+ @metadata_discards ||= []
98
+ unknown_discards = Set.new(@metadata_discards) - KNOWN_DISCARDS
99
+ raise "unknown metadata discards: #{unknown_discards.to_a.inspect}" unless unknown_discards.empty? # convert the Set to an Array so the message lists the rejected values
100
+ end
101
+
102
+ def do_request(post_fields)
103
+ unless @client.http_post(post_fields)
104
+ raise 'unable to post to api endpoint'
105
+ end
106
+
107
+ @client.body_str
45
108
  end
46
109
  end
47
110
  end
@@ -1,77 +1,195 @@
1
1
  module Calais
2
2
  class Response
3
- attr_reader :rdf, :names, :relationships, :error
4
-
5
- def initialize(raw, error=nil)
6
- @error = error
7
- @names = []
8
- @relationships = []
9
-
10
- parse_raw(raw)
11
- return if @error
12
-
13
- parse_names
14
- parse_relationships
3
+ MATCHERS = {
4
+ :docinfo => 'DocInfo',
5
+ :docinfometa => 'DocInfoMeta',
6
+ :defaultlangid => 'DefaultLangId',
7
+ :doccat => 'DocCat',
8
+ :entities => 'type/em/e',
9
+ :relations => 'type/em/r',
10
+ :geographies => 'type/er',
11
+ :instances => 'type/sys/InstanceInfo',
12
+ :relevances => 'type/sys/RelevanceInfo',
13
+ }
14
+
15
+ attr_accessor :submitter_code, :signature, :language, :submission_date, :request_id, :doc_title, :doc_date
16
+ attr_accessor :hashes, :entities, :relations, :geographies, :categories
17
+
18
+ def initialize(rdf_string)
19
+ @raw_response = rdf_string
20
+
21
+ @hashes = []
22
+ @entities = []
23
+ @relations = []
24
+ @geographies = []
25
+ @relevances = {} # key = String hash, val = Float relevance
26
+ @categories = []
27
+
28
+ extract_data
29
+ end
30
+
31
+ class Entity
32
+ attr_accessor :hash, :type, :attributes, :relevance, :instances
33
+ end
34
+
35
+ class Relation
36
+ attr_accessor :hash, :type, :attributes, :instances
15
37
  end
16
-
17
- Name::TYPES.each_pair do |method_name, type|
18
- define_method method_name.to_sym do
19
- @names.map {|name| name if name.type == type }.compact
38
+
39
+ class Geography
40
+ attr_accessor :name, :hash, :attributes
41
+ end
42
+
43
+ class Category
44
+ attr_accessor :name, :score
45
+ end
46
+
47
+ class Instance
48
+ attr_accessor :prefix, :exact, :suffix, :offset, :length
49
+
50
+ # Makes a new Instance object from an appropriate LibXML::XML::Node.
51
+ def self.from_node(node)
52
+ instance = self.new
53
+ instance.prefix = node.find_first("c:prefix").content
54
+ instance.exact = node.find_first("c:exact").content
55
+ instance.suffix = node.find_first("c:suffix").content
56
+ instance.offset = node.find_first("c:offset").content.to_i
57
+ instance.length = node.find_first("c:length").content.to_i
58
+
59
+ instance
20
60
  end
21
61
  end
22
-
62
+
63
+ class CalaisHash
64
+ attr_accessor :value
65
+
66
+ def self.find_or_create(hash, hashes)
67
+ if !selected = hashes.select {|h| h.value == hash }.first
68
+ selected = self.new
69
+ selected.value = hash
70
+ hashes << selected
71
+ end
72
+
73
+ selected
74
+ end
75
+ end
76
+
23
77
  private
24
- def parse_raw(raw)
25
- @libxml = XML::Parser.string(XML::Parser.string(raw).parse.root.child.content).parse
26
- @rdf = @libxml.to_s
27
- @error = @libxml.find("/Error/Exception").first.content rescue @error
78
+ def extract_data
79
+ doc = XML::Parser.string(@raw_response).parse
80
+
81
+ doc.root.find("rdf:Description/rdf:type[contains(@rdf:resource, '#{MATCHERS[:docinfometa]}')]/..").each do |node|
82
+ @language = node['language']
83
+ @submission_date = DateTime.parse node['submissionDate']
84
+
85
+ attributes = extract_attributes(node.find("*[contains(name(), 'c:')]"))
86
+
87
+ @signature = attributes.delete('signature')
88
+ @submitter_code = attributes.delete('submitterCode')
89
+
90
+ node.remove!
91
+ end
92
+
93
+ doc.root.find("rdf:Description/rdf:type[contains(@rdf:resource, '#{MATCHERS[:docinfo]}')]/..").each do |node|
94
+ @request_id = node['calaisRequestID']
95
+
96
+ attributes = extract_attributes(node.find("*[contains(name(), 'c:')]"))
97
+
98
+ @doc_title = attributes.delete('docTitle')
99
+ @doc_date = Date.parse attributes.delete('docDate')
100
+
101
+ node.remove!
102
+ end
103
+
104
+ @categories = doc.root.find("rdf:Description/rdf:type[contains(@rdf:resource, '#{MATCHERS[:doccat]}')]/..").map do |node|
105
+ category = Category.new
106
+ category.name = node.find_first("c:categoryName").content
107
+ category.score = node.find_first("c:score").content.to_f
108
+
109
+ node.remove!
110
+ category
111
+ end
112
+
113
+ @relevances = doc.root.find("rdf:Description/rdf:type[contains(@rdf:resource, '#{MATCHERS[:relevances]}')]/..").inject({}) do |acc, node|
114
+ subject_hash = node.find_first("c:subject")[:resource].split('/')[-1]
115
+ acc[subject_hash] = node.find_first("c:relevance").content.to_f
116
+
117
+ node.remove!
118
+ acc
119
+ end
120
+
121
+ @entities = doc.root.find("rdf:Description/rdf:type[contains(@rdf:resource, '#{MATCHERS[:entities]}')]/..").map do |node|
122
+ extracted_hash = node['about'].split('/')[-1] rescue nil
123
+
124
+ entity = Entity.new
125
+ entity.hash = CalaisHash.find_or_create(extracted_hash, @hashes)
126
+ entity.type = extract_type(node)
127
+ entity.attributes = extract_attributes(node.find("*[contains(name(), 'c:')]"))
128
+
129
+ entity.relevance = @relevances[extracted_hash]
130
+ entity.instances = extract_instances(doc, extracted_hash)
131
+
132
+ node.remove!
133
+ entity
134
+ end
135
+
136
+ @relations = doc.root.find("rdf:Description/rdf:type[contains(@rdf:resource, '#{MATCHERS[:relations]}')]/..").map do |node|
137
+ extracted_hash = node['about'].split('/')[-1] rescue nil
138
+
139
+ relation = Relation.new
140
+ relation.hash = CalaisHash.find_or_create(extracted_hash, @hashes)
141
+ relation.type = extract_type(node)
142
+ relation.attributes = extract_attributes(node.find("*[contains(name(), 'c:')]"))
143
+ relation.instances = extract_instances(doc, extracted_hash)
144
+
145
+ node.remove!
146
+ relation
147
+ end
148
+
149
+ @geographies = doc.root.find("rdf:Description/rdf:type[contains(@rdf:resource, '#{MATCHERS[:geographies]}')]/..").map do |node|
150
+ attributes = extract_attributes(node.find("*[contains(name(), 'c:')]"))
151
+
152
+ geography = Geography.new
153
+ geography.name = attributes.delete('name')
154
+ geography.hash = attributes.delete('subject')
155
+ geography.attributes = attributes
156
+
157
+ node.remove!
158
+ geography
159
+ end
160
+
161
+ doc.root.find("rdf:Description/rdf:type[contains(@rdf:resource, '#{MATCHERS[:defaultlangid]}')]/..").each { |node| node.remove! }
162
+ doc.root.find("./*").each { |node| node.remove! }
163
+
164
+ return
28
165
  end
29
-
30
- def parse_names
31
- @names = @libxml.root.find("rdf:Description/rdf:type[contains(@rdf:resource, '/em/e/')]/..").map do |n|
32
- name = n.find_first("c:name").content
33
- type = n.find_first("rdf:type").properties.to_a.assoc("resource").last.split('/').last
34
- hash = n.properties.to_a.assoc("about").last.split("/").last
35
-
36
- locations = @libxml.root.find("rdf:Description/c:subject[contains(@rdf:resource, '#{hash}')]/..").map do |n2|
37
- start = n2.find_first("c:offset").content.to_i
38
- Range.new(start, start+n2.find_first("c:length").content.to_i)
39
- end
40
-
41
- Name.new(
42
- :name => name,
43
- :hash => hash,
44
- :type => type,
45
- :locations => locations
46
- )
166
+
167
+ def extract_instances(doc, hash)
168
+ doc.root.find("rdf:Description/rdf:type[contains(@rdf:resource, '#{MATCHERS[:instances]}')]/..").select do |instance_node|
169
+ instance_node.find_first("c:subject")[:resource].split("/")[-1] == hash
170
+ end.map do |instance_node|
171
+ instance = Instance.from_node(instance_node)
172
+ instance_node.remove!
173
+
174
+ instance
47
175
  end
48
176
  end
49
-
50
- def parse_relationships
51
- @libxml.root.find("rdf:Description/rdf:type[contains(@rdf:resource, '/em/r')]/..").each do |n|
52
- hash = n.properties.to_a.assoc("about").last.split("/").last
53
- type = n.find_first("rdf:type").properties.to_a.assoc("resource").last.split('/').last
54
-
55
- metadata = {}
56
-
57
- n.to_a.each do |n2|
58
- next if n2.name == "type" or n2.comment?
59
- resource = n2.properties.to_a.assoc("resource")
60
- metadata[n2.name] = resource ? Name.find_in_names(resource.last.split("/").last, @names) : n2.content.strip
61
- end
62
-
63
- locations = @libxml.root.find("rdf:Description/c:subject[contains(@rdf:resource, '#{hash}')]/..").map do |n2|
64
- start = n2.find_first("c:offset").content.to_i
65
- Range.new(start, start+n2.find_first("c:length").content.to_i)
66
- end
67
-
68
-
69
- @relationships << Relationship.new(
70
- :type => type,
71
- :hash => hash,
72
- :metadata => metadata,
73
- :locations => locations
74
- )
177
+
178
+ def extract_type(node)
179
+ node.find("*[name()='rdf:type']")[0]['resource'].split('/')[-1]
180
+ rescue
181
+ nil
182
+ end
183
+
184
+ def extract_attributes(nodes)
185
+ nodes.inject({}) do |hsh, node|
186
+ value = if node['resource']
187
+ extracted_hash = node['resource'].split('/')[-1] rescue nil
188
+ CalaisHash.find_or_create(extracted_hash, @hashes)
189
+ else
190
+ node.content
191
+ end
192
+ hsh.merge(node.name => value)
75
193
  end
76
194
  end
77
195
  end