sshingler-calais 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.markdown +33 -0
- data/MIT-LICENSE +20 -0
- data/README.markdown +49 -0
- data/Rakefile +97 -0
- data/VERSION.yml +4 -0
- data/lib/calais/client.rb +113 -0
- data/lib/calais/error.rb +3 -0
- data/lib/calais/response.rb +201 -0
- data/lib/calais.rb +53 -0
- data/spec/calais/client_spec.rb +79 -0
- data/spec/calais/response_spec.rb +128 -0
- data/spec/helper.rb +12 -0
- metadata +113 -0
data/CHANGELOG.markdown
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# Changes
|
2
|
+
|
3
|
+
## 0.0.7
|
4
|
+
* verified 4.0 API
|
5
|
+
* moved gem packaging to `jeweler` and documentation to `yard`
|
6
|
+
|
7
|
+
## 0.0.6
|
8
|
+
* fully implemented 3.1 API
|
9
|
+
|
10
|
+
## 0.0.5
|
11
|
+
* fixed error where classes weren't being required in the proper order on Ubuntu (reported by Jon Moses)
|
12
|
+
* New things coming back from the API. Fixing in tests.
|
13
|
+
|
14
|
+
## 0.0.4
|
15
|
+
* changed dependency from `hpricot` to `libxml`
|
16
|
+
* unicode fun
|
17
|
+
* cleanup all around
|
18
|
+
|
19
|
+
## 0.0.3
|
20
|
+
* pluginized the library for Rails (thanks [pius](http://gitorious.org/projects/calais-au-rails))
|
21
|
+
* added helper methods name entity types from a response
|
22
|
+
|
23
|
+
## 0.0.2
|
24
|
+
* cleanup in the specs
|
25
|
+
* cleaner parsing
|
26
|
+
* location of named entities
|
27
|
+
* more data in relationships
|
28
|
+
* moved Names and Relationships
|
29
|
+
|
30
|
+
## 0.0.1
|
31
|
+
* Access to OpenCalais's Enlighten action
|
32
|
+
* Single method to process a document
|
33
|
+
* Get relationships and names from a document
|
data/MIT-LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2008 Abhay Kumar info@opensynapse.net
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
'Software'), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
17
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
18
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
19
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
20
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.markdown
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
# Calais #
|
2
|
+
A Ruby interface to the [Open Calais Web Service](http://opencalais.com)
|
3
|
+
|
4
|
+
## Features ##
|
5
|
+
* Accepts documents in text/plain, text/xml and text/html format.
|
6
|
+
* Basic access to the Open Calais API's Enlighten action.
|
7
|
+
* Output is RDF representation of input document.
|
8
|
+
* Single function ability to extract names, entities and geographies from given text.
|
9
|
+
|
10
|
+
## Synopsis ##
|
11
|
+
|
12
|
+
This is a very basic wrapper to the Open Calais API. It uses the POST endpoint and currently supports the Enlighten action. Here's a simple call:
|
13
|
+
|
14
|
+
Calais.enlighten(
|
15
|
+
:content => "The government of the United Kingdom has given corporations like fast food chain McDonald's the right to award high school qualifications to employees who complete a company training program.",
|
16
|
+
:content_type => :raw,
|
17
|
+
:license_id => 'your license id'
|
18
|
+
)
|
19
|
+
|
20
|
+
This is the easiest way to get the RDF-formatted response from the OpenCalais service.
|
21
|
+
|
22
|
+
If you want to do something more fun like getting all sorts of fun information about a document, you can try this:
|
23
|
+
|
24
|
+
Calais.process_document(
|
25
|
+
:content => "The government of the United Kingdom has given corporations like fast food chain McDonald's the right to award high school qualifications to employees who complete a company training program.",
|
26
|
+
:content_type => :raw,
|
27
|
+
:license_id => 'your license id'
|
28
|
+
)
|
29
|
+
|
30
|
+
This will return an object containing information extracted from the RDF response.
|
31
|
+
|
32
|
+
## Requirements ##
|
33
|
+
|
34
|
+
* [Ruby 1.8.5 or better](http://ruby-lang.org)
|
35
|
+
* [nokogiri](http://nokogiri.rubyforge.org/nokogiri/), [libxml2](http://xmlsoft.org/), [libxslt](http://xmlsoft.org/xslt/)
|
36
|
+
* [curb](http://curb.rubyforge.org/), [libcurl](http://curl.haxx.se/)
|
37
|
+
* [json](http://json.rubyforge.org/)
|
38
|
+
|
39
|
+
## Install ##
|
40
|
+
|
41
|
+
You can install the Calais gem via Rubygems (`gem install calais`) or by building from source.
|
42
|
+
|
43
|
+
## Authors ##
|
44
|
+
|
45
|
+
* [Abhay Kumar](http://opensynapse.net)
|
46
|
+
|
47
|
+
## Acknowledgements ##
|
48
|
+
|
49
|
+
* [Paul Legato](http://www.economaton.com/): Help all around with the new response processor and implementation of the 3.1 API.
|
data/Rakefile
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
require 'rake'
|
4
|
+
require 'rake/clean'
|
5
|
+
|
6
|
+
require './lib/calais.rb'
|
7
|
+
|
8
|
+
begin
  gem 'jeweler', '>= 1.0.1'
  require 'jeweler'

  # Gem packaging is delegated to Jeweler. The summary and the description
  # carry the same text, and the runtime dependencies are listed once here.
  blurb = 'A Ruby interface to the Calais Web Service'
  runtime_dependencies = [
    ['nokogiri', '>= 1.3.3'],
    ['json',     '>= 1.1.3'],
    ['curb',     '>= 0.1.4']
  ]

  Jeweler::Tasks.new do |gemspec|
    gemspec.name              = 'calais'
    gemspec.summary           = blurb
    gemspec.email             = 'info@opensynapse.net'
    gemspec.homepage          = 'http://github.com/abhay/calais'
    gemspec.description       = blurb
    gemspec.authors           = ['Abhay Kumar']
    gemspec.files             = FileList["[A-Z]*", "{bin,generators,lib,test}/**/*"]
    gemspec.rubyforge_project = 'calais'

    runtime_dependencies.each do |gem_name, requirement|
      gemspec.add_dependency gem_name, requirement
    end
  end
rescue LoadError
  # Raised by either the `gem` activation or the `require` above.
  puts "Jeweler, or one of its dependencies, is not available. Please install it."
  exit(1)
end
|
29
|
+
|
30
|
+
begin
  require 'spec/rake/spectask'

  desc "Run all specs"
  Spec::Rake::SpecTask.new do |t|
    t.spec_files = FileList["spec/**/*_spec.rb"].sort
    t.spec_opts = ["--options", "spec/spec.opts"]
  end

  desc "Run all specs and get coverage statistics"
  Spec::Rake::SpecTask.new('coverage') do |t|
    t.spec_opts = ["--options", "spec/spec.opts"]
    # Use the same recursive glob as the default task: the spec files sit in
    # subdirectories of spec/, so a non-recursive "spec/*_spec.rb" matches
    # nothing and the coverage run would exercise no code at all.
    t.spec_files = FileList["spec/**/*_spec.rb"].sort
    t.rcov_opts = ["--exclude", "spec", "--exclude", "gems"]
    t.rcov = true
  end

  task :default => :spec
rescue LoadError
  puts "RSpec, or one of its dependencies, is not available. Please install it."
  exit(1)
end
|
52
|
+
|
53
|
+
begin
  require 'yard'
  require 'yard/rake/yardoc_task'

  # API documentation is generated with YARD; `rake rdoc` is kept as an alias.
  YARD::Rake::YardocTask.new do |yardoc|
    yardoc.options = %w[--verbose --markup=markdown --files=CHANGELOG.markdown,MIT-LICENSE]
  end

  task :rdoc => :yardoc

  # Generated output that `rake clobber` should remove.
  %w[doc .yardoc].each { |generated| CLOBBER.include generated }
rescue LoadError
  puts "Yard, or one of its dependencies is not available. Please install it."
  exit(1)
end
|
69
|
+
|
70
|
+
begin
  # YAML is used by the :docs task below to read the RubyForge user config;
  # require it explicitly instead of relying on another library loading it.
  require 'yaml'
  require 'rake/contrib/sshpublisher'
  namespace :rubyforge do

    desc "Release gem and RDoc documentation to RubyForge"
    task :release => ["rubyforge:release:gem", "rubyforge:release:docs"]

    namespace :release do
      desc "Publish RDoc to RubyForge."
      task :docs => [:yardoc] do
        # The RubyForge username comes from the user's local config file.
        config = YAML.load(
          File.read(File.expand_path('~/.rubyforge/user-config.yml'))
        )

        host = "#{config['username']}@rubyforge.org"
        remote_dir = "/var/www/gforge-projects/calais/"
        local_dir = 'doc'

        Rake::SshDirPublisher.new(host, remote_dir, local_dir).upload
      end
    end
  end
rescue LoadError
  puts "Rake SshDirPublisher is unavailable or your rubyforge environment is not configured."
  exit(1)
end
|
96
|
+
|
97
|
+
# vim: syntax=Ruby
|
data/VERSION.yml
ADDED
@@ -0,0 +1,113 @@
|
|
1
|
+
module Calais
  # Builds and submits one request to the OpenCalais REST "enlighten" service.
  #
  # Every attribute can be given in the options hash passed to +new+, set
  # inside the block yielded by +new+, or assigned later through its accessor.
  class Client
    # base attributes of the call
    attr_accessor :content
    attr_accessor :license_id

    # processing directives
    attr_accessor :content_type, :output_format, :reltag_base_url, :calculate_relevance, :omit_outputting_original_text
    attr_accessor :store_rdf, :metadata_enables, :metadata_discards

    # user directives
    attr_accessor :allow_distribution, :allow_search, :external_id, :submitter

    attr_accessor :external_metadata

    # when truthy, requests go to BETA_REST_ENDPOINT instead of REST_ENDPOINT
    attr_accessor :use_beta

    # options - Hash of attribute name => value. Each pair is applied through
    #           the matching writer, so an unknown key raises NoMethodError.
    #
    # Yields the new client when a block is given.
    def initialize(options={}, &block)
      options.each {|k,v| send("#{k}=", v)}
      yield(self) if block_given?
    end

    # Posts the license id, the content and the params XML to the configured
    # endpoint and returns the response body reported by the HTTP client.
    #
    # Raises RuntimeError when the parameters are invalid (see check_params)
    # or when the POST is reported as unsuccessful.
    def enlighten
      post_args = {
        "licenseID" => @license_id,
        # UTF-8 -> UTF-8 with //IGNORE drops byte sequences that are not valid
        # UTF-8. A space is appended before conversion and sliced off again
        # afterwards ([0..-2]) -- presumably so that an invalid sequence at
        # the very end of the content is also caught; TODO confirm.
        "content" => Iconv.iconv('UTF-8//IGNORE', 'UTF-8', "#{@content} ").first[0..-2],
        "paramsXML" => params_xml
      }

      @client ||= Curl::Easy.new
      @client.url = @use_beta ? BETA_REST_ENDPOINT : REST_ENDPOINT
      @client.timeout = HTTP_TIMEOUT

      post_fields = post_args.map {|k,v| Curl::PostField.content(k, v) }

      do_request(post_fields)
    end

    # Validates the attributes and serialises the processing and user
    # directives (plus optional external metadata) as a <c:params> element.
    #
    # Returns the XML as a String. Raises RuntimeError on invalid attributes.
    def params_xml
      # Must run first: besides validating, it defaults @metadata_enables and
      # @metadata_discards to [] so the empty? checks below are safe.
      check_params
      document = Nokogiri::XML::Document.new

      params_node = Nokogiri::XML::Node.new('c:params', document)
      params_node['xmlns:c'] = 'http://s.opencalais.com/1/pred/'
      params_node['xmlns:rdf'] = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'

      processing_node = Nokogiri::XML::Node.new('c:processingDirectives', document)
      processing_node['c:contentType'] = AVAILABLE_CONTENT_TYPES[@content_type] if @content_type
      processing_node['c:outputFormat'] = AVAILABLE_OUTPUT_FORMATS[@output_format] if @output_format
      # Only an explicit false is sent; nil and true emit no attribute.
      processing_node['c:calculateRelevanceScore'] = 'false' if @calculate_relevance == false
      processing_node['c:reltagBaseURL'] = @reltag_base_url.to_s if @reltag_base_url

      processing_node['c:enableMetadataType'] = @metadata_enables.join(',') unless @metadata_enables.empty?
      # Stringified like the other boolean directives, so the attribute value
      # handed to Nokogiri is always a String.
      processing_node['c:docRDFaccessible'] = @store_rdf.to_s if @store_rdf
      processing_node['c:discardMetadata'] = @metadata_discards.join(';') unless @metadata_discards.empty?
      processing_node['c:omitOutputtingOriginalText'] = 'true' if @omit_outputting_original_text

      user_node = Nokogiri::XML::Node.new('c:userDirectives', document)
      user_node['c:allowDistribution'] = @allow_distribution.to_s unless @allow_distribution.nil?
      user_node['c:allowSearch'] = @allow_search.to_s unless @allow_search.nil?
      user_node['c:externalID'] = @external_id.to_s if @external_id
      user_node['c:submitter'] = @submitter.to_s if @submitter

      params_node << processing_node
      params_node << user_node

      if @external_metadata
        external_node = Nokogiri::XML::Node.new('c:externalMetadata', document)
        external_node << @external_metadata
        params_node << external_node
      end

      params_node.to_xml(:indent => 2)
    end

    private

    # Validates the request attributes and defaults the metadata lists to [].
    #
    # Raises RuntimeError with a descriptive message on the first problem
    # found: missing/too small/too large content, missing license id, unknown
    # content type or output format, a non-boolean flag, or an unknown
    # metadata enable/discard name.
    def check_params
      raise 'missing content' if @content.nil? || @content.empty?

      content_length = @content.length
      raise 'content is too small' if content_length < MIN_CONTENT_SIZE
      raise 'content is too large' if content_length > MAX_CONTENT_SIZE

      raise 'missing license id' if @license_id.nil? || @license_id.empty?

      # Both settings are optional; they are only validated when present.
      raise 'unknown content type' if @content_type && !AVAILABLE_CONTENT_TYPES.key?(@content_type)
      raise 'unknown output format' if @output_format && !AVAILABLE_OUTPUT_FORMATS.key?(@output_format)

      # These flags are tri-state: nil (unset), true or false.
      %w[calculate_relevance store_rdf allow_distribution allow_search].each do |variable|
        value = self.send(variable)
        unless value.nil? || value == true || value == false
          raise "expected a boolean value for #{variable} but got #{value}"
        end
      end

      @metadata_enables ||= []
      unknown_enables = Set.new(@metadata_enables) - KNOWN_ENABLES
      raise "unknown metadata enables: #{unknown_enables.to_a.inspect}" unless unknown_enables.empty?

      @metadata_discards ||= []
      unknown_discards = Set.new(@metadata_discards) - KNOWN_DISCARDS
      raise "unknown metadata discards: #{unknown_discards.to_a.inspect}" unless unknown_discards.empty?
    end

    # Performs the HTTP POST with the prepared fields and returns the body
    # string of the response. Raises RuntimeError if the POST reports failure.
    def do_request(post_fields)
      unless @client.http_post(post_fields)
        raise 'unable to post to api endpoint'
      end

      @client.body_str
    end
  end
end
|
data/lib/calais/error.rb
ADDED
@@ -0,0 +1,201 @@
|
|
1
|
+
module Calais
  # Parses the RDF/XML document returned by the OpenCalais service into plain
  # Ruby objects: document information, categories, entities, relations and
  # geographies.
  class Response
    # Substrings looked for in the rdf:resource of each description's
    # rdf:type element to decide what kind of description it is.
    MATCHERS = {
      :docinfo       => 'DocInfo',
      :docinfometa   => 'DocInfoMeta',
      :defaultlangid => 'DefaultLangId',
      :doccat        => 'DocCat',
      :entities      => 'type/em/e',
      :relations     => 'type/em/r',
      :geographies   => 'type/er',
      :instances     => 'type/sys/InstanceInfo',
      :relevances    => 'type/sys/RelevanceInfo',
    }

    attr_accessor :submitter_code, :signature, :language, :submission_date, :request_id, :doc_title, :doc_date
    attr_accessor :hashes, :entities, :relations, :geographies, :categories

    # rdf_string - the raw XML response body as a String.
    #
    # Raises Calais::Error when the body is not XML or is an <Error> document.
    def initialize(rdf_string)
      @raw_response = rdf_string

      @hashes = []
      @entities = []
      @relations = []
      @geographies = []
      @relevances = {} # key = String hash, val = Float relevance
      @categories = []

      extract_data
    end

    class Entity
      attr_accessor :calais_hash, :type, :attributes, :relevance, :instances
    end

    class Relation
      attr_accessor :calais_hash, :type, :attributes, :instances
    end

    class Geography
      attr_accessor :name, :calais_hash, :attributes
    end

    class Category
      attr_accessor :name, :score
    end

    class Instance
      attr_accessor :prefix, :exact, :suffix, :offset, :length

      # Makes a new Instance object from an appropriate Nokogiri::XML::Node.
      # The node is expected to have c:prefix, c:exact, c:suffix, c:offset and
      # c:length children; offset and length are converted with to_i.
      def self.from_node(node)
        instance = self.new
        instance.prefix = node.xpath("c:prefix[1]").first.content
        instance.exact = node.xpath("c:exact[1]").first.content
        instance.suffix = node.xpath("c:suffix[1]").first.content
        instance.offset = node.xpath("c:offset[1]").first.content.to_i
        instance.length = node.xpath("c:length[1]").first.content.to_i

        instance
      end
    end

    # Wrapper around one hash string; a single shared object is kept per
    # distinct value in the list handed to find_or_create.
    class CalaisHash
      attr_accessor :value

      # Returns the element of +hashes+ whose value equals +hash+, creating
      # and appending a new CalaisHash when there is none yet.
      def self.find_or_create(hash, hashes)
        selected = hashes.detect {|h| h.value == hash }

        unless selected
          selected = self.new
          selected.value = hash
          hashes << selected
        end

        selected
      end
    end

    private

    # Walks the response document and fills in every attribute of the
    # response. Each description is removed from the document once it has
    # been consumed, so later, broader matchers cannot pick it up again.
    def extract_data
      doc = Nokogiri::XML(@raw_response)

      # An empty or non-XML body leaves the document without a root element.
      raise Calais::Error, 'response is not a well-formed XML document' if doc.root.nil?

      if doc.root.xpath("/Error[1]").first
        raise Calais::Error, doc.root.xpath("/Error/Exception").first.content
      end

      # DocInfoMeta is handled (and removed) before DocInfo because the
      # DocInfo matcher is a substring of it and would otherwise match both.
      doc.root.xpath("rdf:Description/rdf:type[contains(@rdf:resource, '#{MATCHERS[:docinfometa]}')]/..").each do |node|
        @language = node['language']
        @submission_date = parse_optional_date(DateTime, node['submissionDate'])

        attributes = extract_attributes(node.xpath("*[contains(name(), 'c:')]"))

        @signature = attributes.delete('signature')
        @submitter_code = attributes.delete('submitterCode')

        node.remove
      end

      doc.root.xpath("rdf:Description/rdf:type[contains(@rdf:resource, '#{MATCHERS[:docinfo]}')]/..").each do |node|
        @request_id = node['calaisRequestID']

        attributes = extract_attributes(node.xpath("*[contains(name(), 'c:')]"))

        @doc_title = attributes.delete('docTitle')
        @doc_date = parse_optional_date(Date, attributes.delete('docDate'))

        node.remove
      end

      @categories = doc.root.xpath("rdf:Description/rdf:type[contains(@rdf:resource, '#{MATCHERS[:doccat]}')]/..").map do |node|
        category = Category.new
        category.name = node.xpath("c:categoryName[1]").first.content
        # The score element is optional; the score stays nil without it.
        score = node.xpath("c:score[1]").first
        category.score = score.content.to_f unless score.nil?

        node.remove
        category
      end

      # Relevances are collected before the entities, which look them up.
      @relevances = doc.root.xpath("rdf:Description/rdf:type[contains(@rdf:resource, '#{MATCHERS[:relevances]}')]/..").inject({}) do |acc, node|
        subject_hash = node.xpath("c:subject[1]").first[:resource].split('/')[-1]
        acc[subject_hash] = node.xpath("c:relevance[1]").first.content.to_f

        node.remove
        acc
      end

      @entities = doc.root.xpath("rdf:Description/rdf:type[contains(@rdf:resource, '#{MATCHERS[:entities]}')]/..").map do |node|
        extracted_hash = last_path_segment(node['about'])

        entity = Entity.new
        entity.calais_hash = CalaisHash.find_or_create(extracted_hash, @hashes)
        entity.type = extract_type(node)
        entity.attributes = extract_attributes(node.xpath("*[contains(name(), 'c:')]"))

        entity.relevance = @relevances[extracted_hash]
        entity.instances = extract_instances(doc, extracted_hash)

        node.remove
        entity
      end

      @relations = doc.root.xpath("rdf:Description/rdf:type[contains(@rdf:resource, '#{MATCHERS[:relations]}')]/..").map do |node|
        extracted_hash = last_path_segment(node['about'])

        relation = Relation.new
        relation.calais_hash = CalaisHash.find_or_create(extracted_hash, @hashes)
        relation.type = extract_type(node)
        relation.attributes = extract_attributes(node.xpath("*[contains(name(), 'c:')]"))
        relation.instances = extract_instances(doc, extracted_hash)

        node.remove
        relation
      end

      @geographies = doc.root.xpath("rdf:Description/rdf:type[contains(@rdf:resource, '#{MATCHERS[:geographies]}')]/..").map do |node|
        attributes = extract_attributes(node.xpath("*[contains(name(), 'c:')]"))

        geography = Geography.new
        geography.name = attributes.delete('name')
        geography.calais_hash = attributes.delete('subject')
        geography.attributes = attributes

        node.remove
        geography
      end

      # Drop whatever is left in the document.
      doc.root.xpath("rdf:Description/rdf:type[contains(@rdf:resource, '#{MATCHERS[:defaultlangid]}')]/..").each { |node| node.remove }
      doc.root.xpath("./*").each { |node| node.remove }

      return
    end

    # Returns the Instance objects whose c:subject resource ends in +hash+,
    # removing the matching InstanceInfo descriptions from +doc+.
    def extract_instances(doc, hash)
      matching_nodes = doc.root.xpath("rdf:Description/rdf:type[contains(@rdf:resource, '#{MATCHERS[:instances]}')]/..").select do |instance_node|
        instance_node.xpath("c:subject[1]").first[:resource].split("/")[-1] == hash
      end

      matching_nodes.map do |instance_node|
        instance = Instance.from_node(instance_node)
        instance_node.remove

        instance
      end
    end

    # Returns the last path segment of the node's rdf:type resource, or nil
    # when the node has no rdf:type child or that child has no resource.
    def extract_type(node)
      type_node = node.xpath("*[name()='rdf:type']")[0]
      type_node && last_path_segment(type_node['resource'])
    end

    # Builds a Hash of node name => value from +nodes+. A node carrying a
    # resource attribute is mapped to the shared CalaisHash for the last path
    # segment of that resource; any other node is mapped to its text content.
    def extract_attributes(nodes)
      nodes.inject({}) do |hsh, node|
        hsh[node.name] = if node['resource']
          CalaisHash.find_or_create(last_path_segment(node['resource']), @hashes)
        else
          node.content
        end
        hsh
      end
    end

    # Returns the text after the final '/' of +uri+, or nil when +uri+ is nil.
    def last_path_segment(uri)
      uri.split('/')[-1] if uri
    end

    # Parses +text+ with klass.parse (klass is Date or DateTime). Returns nil
    # for a missing or blank value instead of raising, so a response without
    # the date still parses; an unparseable value raises as klass.parse does.
    def parse_optional_date(klass, text)
      return nil if text.nil? || text.to_s.strip.empty?

      klass.parse(text)
    end
  end
end
|
data/lib/calais.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'digest/sha1'
|
2
|
+
require 'net/http'
|
3
|
+
require 'cgi'
|
4
|
+
require 'iconv'
|
5
|
+
require 'set'
|
6
|
+
|
7
|
+
require 'rubygems'
|
8
|
+
require 'nokogiri'
|
9
|
+
require 'json'
|
10
|
+
require 'curb'
|
11
|
+
|
12
|
+
$:.unshift File.expand_path(File.dirname(__FILE__)) + '/calais'
|
13
|
+
|
14
|
+
require 'client'
|
15
|
+
require 'response'
|
16
|
+
require 'error'
|
17
|
+
|
18
|
+
module Calais
  # Service endpoints; Client picks the beta one when use_beta is set.
  REST_ENDPOINT = "http://api.opencalais.com/enlighten/rest/"
  BETA_REST_ENDPOINT = "http://beta.opencalais.com/enlighten/rest/"

  # Symbol accepted for Client#content_type => value sent to the service.
  AVAILABLE_CONTENT_TYPES = {
    :xml => 'text/xml',
    :html => 'text/html',
    :htmlraw => 'text/htmlraw',
    :raw => 'text/raw'
  }

  # Symbol accepted for Client#output_format => value sent to the service.
  AVAILABLE_OUTPUT_FORMATS = {
    :rdf => 'xml/rdf',
    :simple => 'text/simple',
    :microformats => 'text/microformats',
    :json => 'application/json'
  }

  # Names accepted in Client#metadata_enables and Client#metadata_discards.
  KNOWN_ENABLES = ['GenericRelations', 'SocialTags']
  KNOWN_DISCARDS = ['er/Company', 'er/Geo', 'er/Product']

  MAX_RETRIES = 5
  HTTP_TIMEOUT = 60
  MIN_CONTENT_SIZE = 1
  MAX_CONTENT_SIZE = 100_000

  # Builds a Client from the given arguments/block and returns the result of
  # its enlighten call.
  def self.enlighten(*args, &block)
    Client.new(*args, &block).enlighten
  end

  # Builds a Client from the given arguments/block, forces the output format
  # to :rdf and wraps the result of the enlighten call in a Response.
  def self.process_document(*args, &block)
    rdf_client = Client.new(*args, &block)
    rdf_client.output_format = :rdf
    Response.new(rdf_client.enlighten)
  end
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), %w[.. helper])
|
2
|
+
|
3
|
+
describe Calais::Client, :new do
|
4
|
+
it 'accepts arguments as a hash' do
|
5
|
+
client = nil
|
6
|
+
|
7
|
+
lambda { client = Calais::Client.new(:content => SAMPLE_DOCUMENT, :license_id => LICENSE_ID) }.should_not raise_error
|
8
|
+
|
9
|
+
client.license_id.should == LICENSE_ID
|
10
|
+
client.content.should == SAMPLE_DOCUMENT
|
11
|
+
end
|
12
|
+
|
13
|
+
it 'accepts arguments as a block' do
|
14
|
+
client = nil
|
15
|
+
|
16
|
+
lambda {
|
17
|
+
client = Calais::Client.new do |c|
|
18
|
+
c.content = SAMPLE_DOCUMENT
|
19
|
+
c.license_id = LICENSE_ID
|
20
|
+
end
|
21
|
+
}.should_not raise_error
|
22
|
+
|
23
|
+
client.license_id.should == LICENSE_ID
|
24
|
+
client.content.should == SAMPLE_DOCUMENT
|
25
|
+
end
|
26
|
+
|
27
|
+
it 'should not accept unknown attributes' do
|
28
|
+
lambda { Calais::Client.new(:monkey => 'monkey', :license_id => LICENSE_ID) }.should raise_error(NoMethodError)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
describe Calais::Client, :params_xml do
|
33
|
+
it 'returns an xml encoded string' do
|
34
|
+
client = Calais::Client.new(:content => SAMPLE_DOCUMENT, :license_id => LICENSE_ID)
|
35
|
+
client.params_xml.should == %[<c:params xmlns:c=\"http://s.opencalais.com/1/pred/\" xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\">\n <c:processingDirectives/>\n <c:userDirectives/>\n</c:params>]
|
36
|
+
|
37
|
+
client.content_type = :xml
|
38
|
+
client.output_format = :json
|
39
|
+
client.reltag_base_url = 'http://opencalais.com'
|
40
|
+
client.calculate_relevance = true
|
41
|
+
client.metadata_enables = Calais::KNOWN_ENABLES
|
42
|
+
client.metadata_discards = Calais::KNOWN_DISCARDS
|
43
|
+
client.allow_distribution = true
|
44
|
+
client.allow_search = true
|
45
|
+
client.external_id = Digest::SHA1.hexdigest(client.content)
|
46
|
+
client.submitter = 'calais.rb'
|
47
|
+
|
48
|
+
client.params_xml.should == %[<c:params xmlns:c="http://s.opencalais.com/1/pred/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">\n <c:processingDirectives c:contentType="text/xml" c:outputFormat="application/json" c:reltagBaseURL="http://opencalais.com" c:enableMetadataType="GenericRelations,SocialTags" c:discardMetadata="er/Company;er/Geo;er/Product"/>\n <c:userDirectives c:allowDistribution="true" c:allowSearch="true" c:externalID="1a008b91e7d21962e132bc1d6cb252532116a606" c:submitter="calais.rb"/>\n</c:params>]
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
describe Calais::Client, :enlighten do
|
53
|
+
before do
|
54
|
+
@client = Calais::Client.new do |c|
|
55
|
+
c.content = SAMPLE_DOCUMENT
|
56
|
+
c.license_id = LICENSE_ID
|
57
|
+
c.content_type = :xml
|
58
|
+
c.output_format = :json
|
59
|
+
c.calculate_relevance = true
|
60
|
+
c.metadata_enables = Calais::KNOWN_ENABLES
|
61
|
+
c.allow_distribution = true
|
62
|
+
c.allow_search = true
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
it 'provides access to the enlighten command on the generic rest endpoint' do
|
67
|
+
@client.should_receive(:do_request).with(anything).and_return(SAMPLE_RESPONSE)
|
68
|
+
@client.enlighten
|
69
|
+
@client.instance_variable_get(:@client).url.should == Calais::REST_ENDPOINT
|
70
|
+
end
|
71
|
+
|
72
|
+
it 'provides access to the enlighten command on the beta rest endpoint' do
|
73
|
+
@client.use_beta = true
|
74
|
+
|
75
|
+
@client.should_receive(:do_request).with(anything).and_return(SAMPLE_RESPONSE)
|
76
|
+
@client.enlighten
|
77
|
+
@client.instance_variable_get(:@client).url.should == Calais::BETA_REST_ENDPOINT
|
78
|
+
end
|
79
|
+
end
|
@@ -0,0 +1,128 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), %w[.. helper])
|
2
|
+
|
3
|
+
describe Calais::Response, :new do
|
4
|
+
it 'accepts an rdf string to generate the response object' do
|
5
|
+
lambda { Calais::Response.new(SAMPLE_RESPONSE) }.should_not raise_error
|
6
|
+
end
|
7
|
+
end
|
8
|
+
|
9
|
+
describe Calais::Response, :new do
|
10
|
+
it "should return error message in runtime error" do
|
11
|
+
lambda {
|
12
|
+
@response = Calais::Response.new(RESPONSE_WITH_EXCEPTION)
|
13
|
+
}.should raise_error(Calais::Error, "My Error Message")
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
describe Calais::Response, :new do
|
18
|
+
before :all do
|
19
|
+
@response = Calais::Response.new(SAMPLE_RESPONSE)
|
20
|
+
end
|
21
|
+
|
22
|
+
it 'should extract document information' do
|
23
|
+
@response.language.should == 'English'
|
24
|
+
@response.submission_date.should be_a_kind_of(DateTime)
|
25
|
+
@response.signature.should be_a_kind_of(String)
|
26
|
+
@response.submitter_code.should be_a_kind_of(String)
|
27
|
+
@response.request_id.should be_a_kind_of(String)
|
28
|
+
@response.doc_title.should == 'Record number of bicycles sold in Australia in 2006'
|
29
|
+
@response.doc_date.should be_a_kind_of(Date)
|
30
|
+
end
|
31
|
+
|
32
|
+
it 'should extract entities' do
|
33
|
+
entities = @response.entities
|
34
|
+
entities.map { |e| e.type }.sort.uniq.should == %w[City Continent Country IndustryTerm Organization Person Position ProvinceOrState]
|
35
|
+
end
|
36
|
+
|
37
|
+
it 'should extract relations' do
|
38
|
+
relations = @response.relations
|
39
|
+
relations.map { |e| e.type }.sort.uniq.should == %w[GenericRelations PersonAttributes PersonCareer Quotation]
|
40
|
+
end
|
41
|
+
|
42
|
+
it 'should extract geographies' do
|
43
|
+
geographies = @response.geographies
|
44
|
+
geographies.map { |e| e.name }.sort.uniq.should == %w[Australia Hobart,Tasmania,Australia Tasmania,Australia]
|
45
|
+
end
|
46
|
+
|
47
|
+
it 'should extract relevances' do
|
48
|
+
@response.instance_variable_get(:@relevances).should be_a_kind_of(Hash)
|
49
|
+
end
|
50
|
+
|
51
|
+
it 'should assign a floating-point relevance to each entity' do
|
52
|
+
@response.entities.each {|e| e.relevance.should be_a_kind_of(Float) }
|
53
|
+
end
|
54
|
+
|
55
|
+
it 'should find the correct document categories returned by OpenCalais' do
|
56
|
+
@response.categories.map {|c| c.name }.sort.should == %w[Business_Finance Technology_Internet]
|
57
|
+
end
|
58
|
+
|
59
|
+
it 'should find the correct document category scores returned by OpenCalais' do
  # each, not map: the block is run only for its expectation, the collected
  # values were never used.
  @response.categories.each {|c| c.score.should be_a_kind_of(Float) }
end
|
62
|
+
|
63
|
+
it "should not raise an error if no score is given by OpenCalais" do
|
64
|
+
lambda {Calais::Response.new(SAMPLE_RESPONSE_WITH_NO_SCORE)}.should_not raise_error
|
65
|
+
end
|
66
|
+
|
67
|
+
# The description says what this example asserts and keeps it distinct from
# the "should not raise" example next to it, so a failure report identifies
# which of the two failed.
it "should leave the category score nil if no score is given by OpenCalais" do
  response = Calais::Response.new(SAMPLE_RESPONSE_WITH_NO_SCORE)
  response.categories.map {|c| c.score }.should == [nil]
end
|
71
|
+
|
72
|
+
it 'should find instances for each entity' do
|
73
|
+
@response.entities.each {|e|
|
74
|
+
e.instances.size.should > 0
|
75
|
+
}
|
76
|
+
end
|
77
|
+
|
78
|
+
|
79
|
+
it 'should find instances for each relation' do
|
80
|
+
@response.relations.each {|r|
|
81
|
+
r.instances.size.should > 0
|
82
|
+
}
|
83
|
+
end
|
84
|
+
|
85
|
+
it 'should find the correct instances for each entity' do
|
86
|
+
## This currently tests only for the "Australia" entity's
|
87
|
+
## instances. A more thorough test that tests for the instances
|
88
|
+
## of each of the many entities in the sample doc is desirable in
|
89
|
+
## the future.
|
90
|
+
|
91
|
+
australia = @response.entities.select {|e| e.attributes["name"] == "Australia" }.first
|
92
|
+
australia.instances.size.should == 3
|
93
|
+
instances = australia.instances.sort{|a,b| a.offset <=> b.offset }
|
94
|
+
|
95
|
+
instances[0].prefix.should == "number of bicycles sold in "
|
96
|
+
instances[0].exact.should == "Australia"
|
97
|
+
instances[0].suffix.should == " in 2006<\/title>\n<date>January 4,"
|
98
|
+
instances[0].offset.should == 67
|
99
|
+
instances[0].length.should == 9
|
100
|
+
|
101
|
+
instances[1].prefix.should == "4, 2007<\/date>\n<body>\nBicycle sales in "
|
102
|
+
instances[1].exact.should == "Australia"
|
103
|
+
instances[1].suffix.should == " have recorded record sales of 1,273,781 units"
|
104
|
+
instances[1].offset.should == 146
|
105
|
+
instances[1].length.should == 9
|
106
|
+
|
107
|
+
instances[2].prefix.should == " the traditional company car,\" he said.\n\n\"Some of "
|
108
|
+
instances[2].exact.should == "Australia"
|
109
|
+
instances[2].suffix.should == "'s biggest corporations now have bicycle fleets,"
|
110
|
+
instances[2].offset.should == 952
|
111
|
+
instances[2].length.should == 9
|
112
|
+
end
|
113
|
+
|
114
|
+
it 'should find the correct instances for each relation' do
|
115
|
+
## This currently tests only for one relation's instances. A more
|
116
|
+
## thorough test that tests for the instances of each of the many other
|
117
|
+
## relations in the sample doc is desirable in the future.
|
118
|
+
|
119
|
+
rel = @response.relations.select {|e| e.calais_hash.value == "8f3936d9-cf6b-37fc-ae0d-a145959ae3b5" }.first
|
120
|
+
rel.instances.size.should == 1
|
121
|
+
|
122
|
+
rel.instances.first.prefix.should == " manufacturers.\n\nThe Cycling Promotion Fund (CPF) "
|
123
|
+
rel.instances.first.exact.should == "spokesman Ian Christie said Australians were increasingly using bicycles as an alternative to cars."
|
124
|
+
rel.instances.first.suffix.should == " Sales rose nine percent in 2006 while the car"
|
125
|
+
rel.instances.first.offset.should == 425
|
126
|
+
rel.instances.first.length.should == 99
|
127
|
+
end
|
128
|
+
end
|
data/spec/helper.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'spec'
|
3
|
+
require 'yaml'
|
4
|
+
|
5
|
+
require File.dirname(__FILE__) + '/../lib/calais'
|
6
|
+
|
7
|
+
# Directory holding the spec fixtures, resolved relative to this helper file.
FIXTURES_DIR = File.join(File.dirname(__FILE__), 'fixtures')
|
8
|
+
# Contents of the bicycles_australia.xml fixture, read once at load time.
SAMPLE_DOCUMENT = File.read(File.join(FIXTURES_DIR, 'bicycles_australia.xml'))
|
9
|
+
# Contents of the bicycles_australia.response.rdf fixture, read once at load time.
SAMPLE_RESPONSE = File.read(File.join(FIXTURES_DIR, 'bicycles_australia.response.rdf'))
|
10
|
+
# Contents of the twitter_tweet_without_score.response.rdf fixture, read once at load time.
SAMPLE_RESPONSE_WITH_NO_SCORE = File.read(File.join(FIXTURES_DIR, 'twitter_tweet_without_score.response.rdf'))
|
11
|
+
# Contents of the error.response.xml fixture, read once at load time.
RESPONSE_WITH_EXCEPTION = File.read(File.join(FIXTURES_DIR, 'error.response.xml'))
|
12
|
+
# Value stored under 'key' in the calais.yml fixture.
LICENSE_ID = YAML.load(File.read(File.join(FIXTURES_DIR, 'calais.yml')))['key']
|
metadata
ADDED
@@ -0,0 +1,113 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: sshingler-calais
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 9
|
9
|
+
version: 0.0.9
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Abhay Kumar
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2009-09-18 00:00:00 +01:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: nokogiri
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 1
|
29
|
+
- 3
|
30
|
+
- 3
|
31
|
+
version: 1.3.3
|
32
|
+
type: :runtime
|
33
|
+
version_requirements: *id001
|
34
|
+
- !ruby/object:Gem::Dependency
|
35
|
+
name: json
|
36
|
+
prerelease: false
|
37
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - ">="
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
segments:
|
42
|
+
- 1
|
43
|
+
- 1
|
44
|
+
- 3
|
45
|
+
version: 1.1.3
|
46
|
+
type: :runtime
|
47
|
+
version_requirements: *id002
|
48
|
+
- !ruby/object:Gem::Dependency
|
49
|
+
name: curb
|
50
|
+
prerelease: false
|
51
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
segments:
|
56
|
+
- 0
|
57
|
+
- 1
|
58
|
+
- 4
|
59
|
+
version: 0.1.4
|
60
|
+
type: :runtime
|
61
|
+
version_requirements: *id003
|
62
|
+
description: A Ruby interface to the Calais Web Service
|
63
|
+
email: info@opensynapse.net
|
64
|
+
executables: []
|
65
|
+
|
66
|
+
extensions: []
|
67
|
+
|
68
|
+
extra_rdoc_files:
|
69
|
+
- README.markdown
|
70
|
+
files:
|
71
|
+
- CHANGELOG.markdown
|
72
|
+
- MIT-LICENSE
|
73
|
+
- README.markdown
|
74
|
+
- Rakefile
|
75
|
+
- VERSION.yml
|
76
|
+
- lib/calais.rb
|
77
|
+
- lib/calais/client.rb
|
78
|
+
- lib/calais/error.rb
|
79
|
+
- lib/calais/response.rb
|
80
|
+
has_rdoc: true
|
81
|
+
homepage: http://github.com/abhay/calais
|
82
|
+
licenses: []
|
83
|
+
|
84
|
+
post_install_message:
|
85
|
+
rdoc_options:
|
86
|
+
- --charset=UTF-8
|
87
|
+
require_paths:
|
88
|
+
- lib
|
89
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
90
|
+
requirements:
|
91
|
+
- - ">="
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
segments:
|
94
|
+
- 0
|
95
|
+
version: "0"
|
96
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
97
|
+
requirements:
|
98
|
+
- - ">="
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
segments:
|
101
|
+
- 0
|
102
|
+
version: "0"
|
103
|
+
requirements: []
|
104
|
+
|
105
|
+
rubyforge_project: calais
|
106
|
+
rubygems_version: 1.3.6
|
107
|
+
signing_key:
|
108
|
+
specification_version: 2
|
109
|
+
summary: A Ruby interface to the Calais Web Service
|
110
|
+
test_files:
|
111
|
+
- spec/calais/client_spec.rb
|
112
|
+
- spec/calais/response_spec.rb
|
113
|
+
- spec/helper.rb
|