dover_to_calais 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/Rakefile +6 -1
- data/dover_to_calais.gemspec +4 -0
- data/features/data_mining.feature +10 -0
- data/features/data_sources.feature +23 -3
- data/features/filtering.feature +11 -5
- data/features/step_definitions/data_mining_steps.rb +32 -0
- data/features/step_definitions/data_sources_steps.rb +14 -4
- data/features/step_definitions/filtering_steps.rb +10 -6
- data/lib/dover_to_calais.rb +377 -45
- data/lib/dover_to_calais/models.rb +46 -0
- data/lib/dover_to_calais/ontology.rb +29 -22
- data/lib/dover_to_calais/version.rb +1 -1
- metadata +87 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
MmI4YTNmZGZkYTViNDVlMzVmNGRiMGM3YTFlYzQ5YTc1ZTdmMDU3ZA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
M2U0YjI4ZTJhM2E4M2U4MGM2N2E2OWZkYjUwYzExM2YyOWExZDYxNg==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
ZjA0NDgwNjM4MzQxZDYxZmMwNjFmYzg2MTc4MjI1MDY0ZmU2ZGI2ZWZmNTY4
|
10
|
+
MjU5OTBhZWVkY2YyYzEyODhiOTI4NWI3NGQwZjI4NTg4ZWNiNmVlZWQwZjAx
|
11
|
+
NGJjYjc5YzdhMWJmN2UyODQ1YzM0MGUxY2VkOGU4YmQ5NjU3OTc=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
YjY3NDk2ODY1YjE4ZjM3NDMzZWQyMzc5Njc3YzRiODgyODYxNjJjMGY3YmUz
|
14
|
+
MDJiNzQzMTZhZjQxYWJjOWIxMjIyZjU3YjE3YmE5MDVmMjk2YjA3Y2QxZmQz
|
15
|
+
MmQ1YjY0ZTljNGQ0NGRkODU4NTg0ZWIzYzM4YWRmZGIxYTgxNDM=
|
data/Rakefile
CHANGED
@@ -3,5 +3,10 @@ require 'cucumber'
|
|
3
3
|
require 'cucumber/rake/task'
|
4
4
|
|
5
5
|
Cucumber::Rake::Task.new(:features) do |t|
|
6
|
-
|
6
|
+
if ENV['TAGS'].empty?
|
7
|
+
t.cucumber_opts = "features --format pretty API_KEY=#{ENV['API_KEY']}"
|
8
|
+
else
|
9
|
+
t.cucumber_opts = "features --format pretty API_KEY=#{ENV['API_KEY']} --tags #{ENV['TAGS']}"
|
10
|
+
end
|
11
|
+
|
7
12
|
end
|
data/dover_to_calais.gemspec
CHANGED
@@ -21,6 +21,10 @@ Gem::Specification.new do |spec|
|
|
21
21
|
spec.add_runtime_dependency "eventmachine", "~> 1.0", ">= 1.0.3"
|
22
22
|
spec.add_runtime_dependency "em-http-request", "~> 1.1"
|
23
23
|
spec.add_runtime_dependency "yomu", "~> 0.1", ">= 0.1.9"
|
24
|
+
spec.add_runtime_dependency "json", "~> 1.5", ">= 1.5.5"
|
25
|
+
spec.add_runtime_dependency "ohm", "~> 2.0", ">= 2.0.0"
|
26
|
+
spec.add_runtime_dependency "ohm-contrib", "~> 2.0", ">= 2.0.0"
|
27
|
+
spec.add_runtime_dependency "em-throttled_queue", "~> 1.1", ">= 1.1.0"
|
24
28
|
|
25
29
|
|
26
30
|
spec.files = `git ls-files`.split($/)
|
@@ -0,0 +1,10 @@
|
|
1
|
+
Feature: Ability to detect relationships between entities and events
|
2
|
+
|
3
|
+
Background:
|
4
|
+
Given the file 'test_file_1.txt' is successfully processed with the rich output
|
5
|
+
|
6
|
+
@rich_output
|
7
|
+
Scenario: Filter an entity with the rich output format
|
8
|
+
When I filter the response on {:entity => 'Event', :value => 'Meeting'}
|
9
|
+
Then The output should be an error
|
10
|
+
|
@@ -1,7 +1,11 @@
|
|
1
|
-
Feature: Able to handle wide range of data formats as input
|
2
|
-
|
1
|
+
Feature: Able to handle wide range of data formats as input
|
2
|
+
|
3
|
+
|
4
|
+
@simple_output
|
5
|
+
Scenario Outline: Processing various data-source formats (Simple Format)
|
3
6
|
Given the file <input>
|
4
|
-
When
|
7
|
+
When the Output format is set to 'Text/Simple'
|
8
|
+
And DoverToCalais processes this file
|
5
9
|
Then the output should have no errors
|
6
10
|
|
7
11
|
Examples:
|
@@ -12,3 +16,19 @@ Feature: Able to handle wide range of data formats as input
|
|
12
16
|
|test_file_1.pdf|
|
13
17
|
|test_file_1.rtf|
|
14
18
|
|test_file_1.txt|
|
19
|
+
|
20
|
+
@rich_output
|
21
|
+
Scenario Outline: Processing various data-source formats (Rich Format)
|
22
|
+
Given the file <input>
|
23
|
+
When the Output format is set to 'Application/JSON'
|
24
|
+
And DoverToCalais processes this file
|
25
|
+
Then the output should have no errors
|
26
|
+
|
27
|
+
Examples:
|
28
|
+
| input |
|
29
|
+
|test_file_1.doc |
|
30
|
+
|test_file_1.html|
|
31
|
+
|test_file_1.odt|
|
32
|
+
|test_file_1.pdf|
|
33
|
+
|test_file_1.rtf|
|
34
|
+
|test_file_1.txt|
|
data/features/filtering.feature
CHANGED
@@ -1,24 +1,30 @@
|
|
1
|
+
@simple_output
|
1
2
|
Feature: Ability to select certain OpenCalais entities based on certain conditions
|
2
3
|
|
3
4
|
Background:
|
4
|
-
Given the file 'test_file_1.txt' is successfully processed
|
5
|
+
Given the file 'test_file_1.txt' is successfully processed with the simple output
|
5
6
|
|
6
7
|
|
7
|
-
Scenario:
|
8
|
+
Scenario: Filter all entities with a specific name
|
8
9
|
When I filter on {:entity => 'EmailAddress'}
|
9
10
|
Then the output should have 2 entries
|
10
11
|
And All entries should be named 'EmailAddress'
|
11
12
|
|
12
|
-
Scenario:
|
13
|
+
Scenario: Filter an entity with a specific value
|
13
14
|
When I filter on {:entity => 'Event', :value => 'Meeting'}
|
14
15
|
Then the output should have 1 entries
|
15
16
|
And All entries should be named 'Event'
|
16
17
|
And All entries should have the value 'Meeting'
|
17
18
|
|
18
19
|
|
19
|
-
Scenario:
|
20
|
+
Scenario: Filter an entity only if another entity with a specific value exists in the data source
|
20
21
|
When I filter on {:entity => 'Person', :given => {:entity => 'Event', :value => 'Meeting'}}
|
21
22
|
Then the output should have 2 entries
|
22
23
|
And All entries should be named 'Person'
|
23
24
|
And One entry should have the value 'Roger Kay'
|
24
|
-
And One entry should have the value 'David Bailey'
|
25
|
+
And One entry should have the value 'David Bailey'
|
26
|
+
|
27
|
+
Scenario: Filter all entities with a specific name
|
28
|
+
When I filter on {:entity => 'EmailAddress'}
|
29
|
+
Then the output should have 2 entries
|
30
|
+
And All entries should be named 'EmailAddress'
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'eventmachine'
|
2
|
+
require 'em-http-request'
|
3
|
+
require 'yomu'
|
4
|
+
require 'rspec'
|
5
|
+
require_relative '../../lib/dover_to_calais'
|
6
|
+
#require_relative './filtering_steps.rb'
|
7
|
+
|
8
|
+
|
9
|
+
# N.B Cucumber must be run with the Environment variable 'API_KEY' set
|
10
|
+
# to the OpenCalais API Key value.
|
11
|
+
|
12
|
+
Given(/^the file '(\w+\.\w{3,4})' is successfully processed with the rich output$/) do |file|
|
13
|
+
|
14
|
+
steps %{
|
15
|
+
Given the file #{file}
|
16
|
+
When the Output format is set to 'Application/JSON'
|
17
|
+
And DoverToCalais processes this file
|
18
|
+
Then the output should have no errors
|
19
|
+
}
|
20
|
+
|
21
|
+
end
|
22
|
+
|
23
|
+
When(/^I filter the response on ({.+})$/) do |f|
|
24
|
+
@filtered_output = @output.filter(eval(f))
|
25
|
+
|
26
|
+
end
|
27
|
+
|
28
|
+
Then(/^The output should be an error$/) do
|
29
|
+
@filtered_output.match(/^ERR:\s/).should_not be_nil
|
30
|
+
end
|
31
|
+
|
32
|
+
|
@@ -4,27 +4,28 @@ require 'eventmachine'
|
|
4
4
|
require 'em-http-request'
|
5
5
|
require 'yomu'
|
6
6
|
require 'rspec'
|
7
|
-
|
8
|
-
|
7
|
+
require_relative '../../lib/dover_to_calais'
|
9
8
|
|
10
9
|
# N.B Cucumber must be run with the Environment variable 'API_KEY' set
|
11
10
|
# to the OpenCalais API Key value.
|
12
11
|
|
13
12
|
|
13
|
+
|
14
14
|
Given(/^the file (\w+\.\w{3,4})$/) do |arg1|
|
15
15
|
puts arg1
|
16
16
|
@input = Dir.pwd + '/test/' + arg1
|
17
17
|
@output = nil
|
18
|
+
|
18
19
|
end
|
19
20
|
|
20
21
|
|
21
22
|
|
22
23
|
When(/^DoverToCalais processes this file$/) do
|
23
24
|
EM.run {
|
24
|
-
|
25
25
|
DoverToCalais::API_KEY = ENV['API_KEY']
|
26
|
+
|
26
27
|
d1 = DoverToCalais::Dover.new(@input)
|
27
|
-
d1.analyse_this
|
28
|
+
d1.analyse_this(@output_format)
|
28
29
|
d1.to_calais do |response|
|
29
30
|
@output = response
|
30
31
|
EM.stop
|
@@ -33,6 +34,15 @@ When(/^DoverToCalais processes this file$/) do
|
|
33
34
|
}
|
34
35
|
end
|
35
36
|
|
37
|
+
When(/^the Output format is set to 'Text\/Simple'$/) do
|
38
|
+
@output_format = nil
|
39
|
+
end
|
40
|
+
|
41
|
+
When(/^the Output format is set to 'Application\/JSON'$/) do
|
42
|
+
@output_format = :rich
|
43
|
+
end
|
44
|
+
|
45
|
+
|
36
46
|
|
37
47
|
|
38
48
|
Then(/^the output should have no errors$/) do
|
@@ -3,27 +3,31 @@ require 'eventmachine'
|
|
3
3
|
require 'em-http-request'
|
4
4
|
require 'yomu'
|
5
5
|
require 'rspec'
|
6
|
-
require File.expand_path('../../../lib/dover_to_calais', __FILE__)
|
7
|
-
|
6
|
+
#require File.expand_path('../../../lib/dover_to_calais', __FILE__)
|
7
|
+
require_relative '../../lib/dover_to_calais'
|
8
8
|
|
9
9
|
# N.B Cucumber must be run with the Environment variable 'API_KEY' set
|
10
10
|
# to the OpenCalais API Key value.
|
11
11
|
|
12
12
|
|
13
13
|
|
14
|
-
Given(/^the file '(\w+\.\w{3,4})' is successfully processed$/) do |file|
|
14
|
+
Given(/^the file '(\w+\.\w{3,4})' is successfully processed with the simple output$/) do |file|
|
15
15
|
|
16
16
|
steps %{
|
17
17
|
Given the file #{file}
|
18
|
-
When
|
18
|
+
When the Output format is set to 'Text/Simple'
|
19
|
+
And DoverToCalais processes this file
|
19
20
|
Then the output should have no errors
|
20
21
|
}
|
21
22
|
|
22
23
|
end
|
23
24
|
|
24
25
|
|
25
|
-
|
26
|
-
|
26
|
+
|
27
|
+
|
28
|
+
|
29
|
+
When(/^I filter on ({.+})$/) do |f|
|
30
|
+
@filtered_output = @output.filter(eval(f))
|
27
31
|
|
28
32
|
end
|
29
33
|
|
data/lib/dover_to_calais.rb
CHANGED
@@ -5,12 +5,22 @@ require 'nokogiri'
|
|
5
5
|
require 'eventmachine'
|
6
6
|
require 'em-http-request'
|
7
7
|
require 'yomu'
|
8
|
+
require 'json'
|
9
|
+
require "dover_to_calais/models" #gem lib file
|
10
|
+
require 'ohm'
|
8
11
|
|
9
12
|
|
10
13
|
module DoverToCalais
|
11
14
|
|
12
15
|
|
13
16
|
PROXY = nil
|
17
|
+
REDIS = "redis://127.0.0.1:6379/6"
|
18
|
+
|
19
|
+
def self.flushdb
|
20
|
+
Ohm.redis = Redic.new(REDIS)
|
21
|
+
Ohm.redis.call "FLUSHDB"
|
22
|
+
end
|
23
|
+
|
14
24
|
|
15
25
|
# The ResponseItem structure holds all potential text and attribute values of an OpenCalais
|
16
26
|
# XML Simple format element.
|
@@ -66,65 +76,361 @@ module DoverToCalais
|
|
66
76
|
# nil if none occurred
|
67
77
|
#
|
68
78
|
class ResponseData
|
69
|
-
attr_reader :error
|
70
79
|
|
80
|
+
|
81
|
+
|
82
|
+
class Entity< Struct.new(:type, :name, :ref)
|
83
|
+
|
84
|
+
def to_hash
|
85
|
+
a_hash = {}
|
86
|
+
self.each_pair do |attr, value|
|
87
|
+
a_hash[attr] = value
|
88
|
+
end
|
89
|
+
a_hash
|
90
|
+
end
|
91
|
+
|
92
|
+
end
|
93
|
+
|
94
|
+
class GenericRelation< Struct.new(:subject, :verb, :object, :detection)
|
95
|
+
|
96
|
+
end #class
|
97
|
+
|
98
|
+
class Event
|
99
|
+
|
100
|
+
attr_reader :entities
|
101
|
+
|
102
|
+
def initialize(events_hash)
|
103
|
+
# @entities = entity_hash
|
104
|
+
events_hash.each do |k,v|
|
105
|
+
unless k.eql?('_typeGroup') || k.eql?('instances') || k.eql?('_typeReference')
|
106
|
+
k = 'type' if k.eql?('_type') #don't like the underscore
|
107
|
+
## create and initialize an instance variable for this key/value pair
|
108
|
+
self.instance_variable_set("@#{k}", v)
|
109
|
+
## create the getter that returns the instance variable
|
110
|
+
self.class.send(:define_method, k, proc{self.instance_variable_get("@#{k}")})
|
111
|
+
## create the setter that sets the instance variable
|
112
|
+
self.class.send(:define_method, "#{k}=", proc{|v| self.instance_variable_set("@#{k}", v)})
|
113
|
+
end
|
114
|
+
end #block
|
115
|
+
end #method
|
116
|
+
|
117
|
+
def each_pair
|
118
|
+
self.instance_variables.each do |a|
|
119
|
+
yield a, self.instance_variable_get(a)
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
def [](attrib_name)
|
124
|
+
self.instance_variables.each do |a|
|
125
|
+
if "@#{a}" == attrib_name
|
126
|
+
self.instance_variable_get(a)
|
127
|
+
end
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
public :each_pair
|
132
|
+
|
133
|
+
end #class
|
134
|
+
|
135
|
+
|
136
|
+
|
137
|
+
|
138
|
+
attr_reader :error, :entities_store, :events_store, :generic_relations_store, :freds
|
71
139
|
# creates a new ResponseData object, passing the name of the data source to be processed
|
72
140
|
#
|
73
|
-
# @param
|
141
|
+
# @param response_data [ Nokogiri::XML::NodeSet, Hash, nil] the XML or JSON data returned by OpenCalais
|
74
142
|
# @param error [ String, nil] an error description if the OpenCalais call has failed
|
75
|
-
def initialize(
|
76
|
-
if
|
77
|
-
@
|
143
|
+
def initialize(response_data = nil, error = nil)
|
144
|
+
if response_data.class.to_s == "Nokogiri::XML::Document"
|
145
|
+
@xml_data = response_data
|
146
|
+
elsif response_data.class.to_s == "Hash"
|
147
|
+
@json_data = response_data
|
148
|
+
prepare_data(response_data)
|
149
|
+
|
150
|
+
|
78
151
|
else
|
79
152
|
@error = error
|
80
153
|
end
|
154
|
+
|
81
155
|
end
|
82
156
|
|
83
157
|
# Returns the response data as an XML string or an error, if one has occurred.
|
84
158
|
#
|
85
159
|
# @return [String] an XML string
|
86
160
|
def to_s
|
87
|
-
|
161
|
+
if @xml_data
|
162
|
+
@xml_data.to_s
|
163
|
+
elsif @json_data
|
164
|
+
@json_data.to_s
|
165
|
+
else
|
166
|
+
@error
|
167
|
+
end
|
168
|
+
end
|
169
|
+
|
170
|
+
|
171
|
+
|
172
|
+
|
173
|
+
# The method will first create three Hash instance variables, where it will store the
|
174
|
+
# Entities, Generic Relations and Events -respectively- from the OpenCalais response.
|
175
|
+
# The key on each Hash instance variable will be the OpenCalais ID and the value will
|
176
|
+
# be the values_hash for that ID.
|
177
|
+
# Secondly, the method will iterate through each Entity, find all of it's related
|
178
|
+
# Relations and Events and store them -in a relational manner- in Redis, via Ohm.
|
179
|
+
#
|
180
|
+
# Only applicable with the JSON (rich) output format
|
181
|
+
#
|
182
|
+
# @param Hash the OpenCalais JSON response, as a Hash
|
183
|
+
# @return an GenericRelation Struct if a match is found, nil otherwise
|
184
|
+
|
185
|
+
def prepare_data(results_hash)
|
186
|
+
|
187
|
+
@entities_store = {}
|
188
|
+
@generic_relations_store = {}
|
189
|
+
@events_store = {}
|
190
|
+
# find all Entities in response
|
191
|
+
@entities_store = results_hash.select{|key, hash| hash["_typeGroup"] == "entities"}
|
192
|
+
# find all GenericRelations in response
|
193
|
+
@generic_relations_store = results_hash.select{|key, hash| hash["_typeGroup"] == "relations" &&
|
194
|
+
hash["_type"] == "GenericRelations"}
|
195
|
+
# find all Events in response
|
196
|
+
@events_store = results_hash.select{|key, hash| hash["_typeGroup"] == "relations" &&
|
197
|
+
hash["_type"] != "GenericRelations"}
|
198
|
+
|
199
|
+
Ohm.redis = Redic.new(REDIS)
|
200
|
+
|
201
|
+
|
202
|
+
#for each Entity find all related Relations and Events and store them to Ohm/Redis
|
203
|
+
@entities_store.each_pair do |k, v|
|
204
|
+
|
205
|
+
entity_set = EntityModel.find(calais_id: k)
|
206
|
+
|
207
|
+
if entity_set.size > 0 #entity already exists in store
|
208
|
+
entity = entity_set.first
|
209
|
+
k = entity.calais_id
|
210
|
+
else #entity doesn't exist in store
|
211
|
+
entity = EntityModel.create(:name => v['name'], :type => v['_type'], :calais_id => k)
|
212
|
+
entity.save
|
213
|
+
end #if
|
214
|
+
|
215
|
+
|
216
|
+
#get all referenced relations
|
217
|
+
find_in_relations(k).each do |obj|
|
218
|
+
|
219
|
+
found_rel = get_relation(obj[0])
|
220
|
+
if found_rel
|
221
|
+
|
222
|
+
found_rel.subject = convert_to_hash(found_rel.subject)
|
223
|
+
found_rel.object = convert_to_hash(found_rel.object)
|
224
|
+
|
225
|
+
relation = EntityModel::RelationModel.create(:subject => found_rel.subject,
|
226
|
+
:object => found_rel.object,
|
227
|
+
:verb => found_rel.verb,
|
228
|
+
:detection => found_rel.detection,
|
229
|
+
:calais_id => obj[0])
|
230
|
+
entity.relations.add(relation)
|
231
|
+
end #if
|
232
|
+
end #each
|
233
|
+
#get all referenced events
|
234
|
+
find_in_events(k).each do |obj|
|
235
|
+
found_event = get_event(obj[0])
|
236
|
+
attribs = {}
|
237
|
+
if found_event
|
238
|
+
|
239
|
+
found_event.each_pair do |key, val|
|
240
|
+
|
241
|
+
key = key.to_s.slice(1, key.length-1)
|
242
|
+
attribs[key] = val
|
243
|
+
|
244
|
+
end #block
|
245
|
+
|
246
|
+
event = EntityModel::EventModel.create(:calais_id => obj[0], :info_hash => attribs)
|
247
|
+
entity.events.add(event)
|
248
|
+
|
249
|
+
end #if
|
250
|
+
|
251
|
+
end #each
|
252
|
+
end #each_pair
|
253
|
+
end #method
|
254
|
+
|
255
|
+
|
256
|
+
|
257
|
+
# Coverts an attribute to an appropriate Hash
|
258
|
+
#
|
259
|
+
# Only applicable with the JSON (rich) output format
|
260
|
+
#
|
261
|
+
# @param [String, DoverToCalais::ResponseData::Entity, Hash] an object
|
262
|
+
# @return a Hash value
|
263
|
+
def convert_to_hash(an_attribute)
|
264
|
+
h = {}
|
265
|
+
if an_attribute.class.to_s.eql?('String')
|
266
|
+
h[:name] = an_attribute
|
267
|
+
end
|
268
|
+
|
269
|
+
if an_attribute.class.to_s.eql?('DoverToCalais::ResponseData::Entity')
|
270
|
+
h = an_attribute.to_hash
|
271
|
+
end
|
272
|
+
|
273
|
+
if an_attribute.class.to_s.eql?('Hash')
|
274
|
+
h = an_attribute
|
275
|
+
end
|
276
|
+
|
277
|
+
h
|
278
|
+
end #method
|
279
|
+
|
280
|
+
|
281
|
+
|
282
|
+
# Retrieves the entity with the specified key (OpenCalais ID)
|
283
|
+
#
|
284
|
+
# Only applicable with the JSON (rich) output format
|
285
|
+
#
|
286
|
+
# @param String the OpenCalais ID
|
287
|
+
# @return an Entity Struct if a match is found, nil otherwise
|
288
|
+
def get_entity(ref_key)
|
289
|
+
if @entities_store.has_key?(ref_key)
|
290
|
+
Entity.new(@entities_store[ref_key]['_type'], @entities_store[ref_key]['name'], ref_key)
|
291
|
+
else
|
292
|
+
nil
|
293
|
+
end
|
88
294
|
end
|
89
295
|
|
90
296
|
|
91
|
-
#
|
297
|
+
# Retrieves the relation with the specified key (OpenCalais ID). The method will also
|
298
|
+
# de-reference any of its attributes that refer to other entities via an OpenCalais ID
|
299
|
+
# and will replace the references with the appropriate Entity structure, if applicable
|
300
|
+
#
|
301
|
+
# Only applicable with the JSON (rich) output format
|
302
|
+
#
|
303
|
+
# @param String the OpenCalais ID
|
304
|
+
# @return an GenericRelation Struct if a match is found, nil otherwise
|
305
|
+
def get_relation(ref_key)
|
306
|
+
if @generic_relations_store.key?(ref_key)
|
307
|
+
|
308
|
+
if @generic_relations_store[ref_key]['relationsubject']
|
309
|
+
gr_subject = @generic_relations_store[ref_key]['relationsubject'].match('^http://d.opencalais.com') ?
|
310
|
+
get_entity(@generic_relations_store[ref_key]['relationsubject']) :
|
311
|
+
@generic_relations_store[ref_key]['relationsubject']
|
312
|
+
else
|
313
|
+
gr_subject = 'N/A'
|
314
|
+
end
|
315
|
+
|
316
|
+
|
317
|
+
if @generic_relations_store[ref_key]['relationobject']
|
318
|
+
gr_object = @generic_relations_store[ref_key]['relationobject'].match('^http://d.opencalais.com') ?
|
319
|
+
get_entity(@generic_relations_store[ref_key]['relationobject']) :
|
320
|
+
@generic_relations_store[ref_key]['relationobject']
|
321
|
+
else
|
322
|
+
gr_object = 'N/A'
|
323
|
+
end
|
324
|
+
|
325
|
+
GenericRelation.new(gr_subject,
|
326
|
+
@generic_relations_store[ref_key]['verb'],
|
327
|
+
gr_object,
|
328
|
+
@generic_relations_store[ref_key]['instances'][0]['exact'] ||= 'N/A')
|
329
|
+
else
|
330
|
+
nil
|
331
|
+
end
|
332
|
+
end
|
333
|
+
|
334
|
+
def get_event(ref_key)
|
335
|
+
|
336
|
+
dereferenced_events = {}
|
337
|
+
|
338
|
+
if @events_store.key?(ref_key)
|
339
|
+
|
340
|
+
@events_store[ref_key].each do |k, v|
|
341
|
+
|
342
|
+
if v.class.to_s.eql?("String") && v.match('^http://d.opencalais.com')
|
343
|
+
dereferenced_events[k] = get_entity(v).to_hash
|
344
|
+
elsif v.class.to_s.eql?("String") && !v.match('^http://d.opencalais.com')
|
345
|
+
h = {}
|
346
|
+
h['name'] = v
|
347
|
+
dereferenced_events[k] = h
|
348
|
+
elsif v.class.to_s.eql?("Array")
|
349
|
+
h = {}
|
350
|
+
h['name'] = v[0]['exact']
|
351
|
+
dereferenced_events[k] = h
|
352
|
+
end
|
353
|
+
end
|
354
|
+
|
355
|
+
Event.new(dereferenced_events)
|
356
|
+
else
|
357
|
+
nil
|
358
|
+
end
|
359
|
+
end
|
360
|
+
|
361
|
+
|
362
|
+
# Selects a Hash of generic relations, where the relations' subject or object attributes
|
363
|
+
# match the specified OpenCalais ID.
|
364
|
+
#
|
365
|
+
# Only applicable with the JSON (rich) output format
|
366
|
+
#
|
367
|
+
# @param String the OpenCalais ID
|
368
|
+
# @return a Hash with the selected matches
|
369
|
+
def find_in_relations(ref_key)
|
370
|
+
@generic_relations_store.select{|key, hash| (hash["relationsubject"] == ref_key) ||
|
371
|
+
(hash["relationobject"] == ref_key) }
|
372
|
+
|
373
|
+
end
|
374
|
+
|
375
|
+
# Selects a Hash of events, where the events' key matches the
|
376
|
+
# specified OpenCalais ID.
|
377
|
+
#
|
378
|
+
# Only applicable with the JSON (rich) output format
|
379
|
+
#
|
380
|
+
# @param String the OpenCalais ID
|
381
|
+
# @return a Hash with the selected matches
|
382
|
+
def find_in_events(ref_key)
|
383
|
+
@events_store.select{|key, hash| hash.has_value?(ref_key) }
|
384
|
+
end
|
385
|
+
|
386
|
+
|
387
|
+
|
388
|
+
# Filters the xml response object to extract relevant data.
|
92
389
|
#
|
93
390
|
# @param params [Hash] a filter Hash (see code samples)
|
94
391
|
# @return [Array[ResponseItem]] a list of relevant response items
|
95
392
|
def filter(params)
|
393
|
+
unless @xml_data
|
394
|
+
return 'ERR: filter method only works with xml-based output!'
|
395
|
+
|
396
|
+
end
|
397
|
+
|
96
398
|
result = Array.new
|
97
399
|
begin
|
98
|
-
if @
|
400
|
+
if @xml_data
|
99
401
|
|
100
402
|
if params[:given]
|
101
|
-
found = @
|
403
|
+
found = @xml_data.xpath("//#{params[:given][:entity]}[contains(text(), #{params[:given][:value].inspect})]")
|
102
404
|
if found.size > 0
|
103
|
-
@
|
405
|
+
@xml_data.xpath("//#{params[:entity]}[contains(text(), #{params[:value].inspect})]").each do |node|
|
104
406
|
result << create_response_item(node)
|
105
407
|
end
|
106
408
|
end
|
107
409
|
else # no conditional
|
108
|
-
@
|
410
|
+
@xml_data.xpath("//#{params[:entity]}[contains(text(), #{params[:value].inspect})]").each do |node|
|
109
411
|
result << create_response_item(node)
|
110
412
|
end
|
111
413
|
end
|
112
414
|
|
113
415
|
return result
|
114
416
|
else # no xml data
|
115
|
-
|
417
|
+
return 'ERR: no valid xml data!'
|
116
418
|
|
117
419
|
end #if
|
118
420
|
|
119
421
|
rescue Exception=>e
|
120
|
-
|
422
|
+
return "ERR: #filter: #{e}"
|
121
423
|
|
122
424
|
end
|
123
425
|
|
124
|
-
|
426
|
+
return result
|
125
427
|
|
126
428
|
end #method
|
127
429
|
|
430
|
+
# Creates a Response Item from an xml node.
|
431
|
+
#
|
432
|
+
# @param node [Nokogiri::XML::Node] an XML node
|
433
|
+
# @return [ResponseItem] a response item object
|
128
434
|
def create_response_item(node)
|
129
435
|
node_relevance = node.attribute('relevance').text.to_f if node.has_attribute?('relevance')
|
130
436
|
node_count = node.attribute('count').text.to_i if node.has_attribute?('count')
|
@@ -133,17 +439,18 @@ module DoverToCalais
|
|
133
439
|
node_orig_value = node.xpath('originalValue').text if node.name.eql?('SocialTag')
|
134
440
|
|
135
441
|
ResponseItem.new(node.name,
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
442
|
+
node.text,
|
443
|
+
node_relevance,
|
444
|
+
node_count,
|
445
|
+
node_normalized,
|
446
|
+
node_importance,
|
447
|
+
node_orig_value )
|
142
448
|
|
143
449
|
end
|
144
450
|
|
145
451
|
public :filter
|
146
|
-
private :create_response_item
|
452
|
+
private :create_response_item, :prepare_data, :convert_to_hash, :find_in_relations, :find_in_events,
|
453
|
+
:get_event, :get_entity
|
147
454
|
|
148
455
|
end #class
|
149
456
|
|
@@ -211,13 +518,23 @@ module DoverToCalais
|
|
211
518
|
|
212
519
|
end #method
|
213
520
|
|
521
|
+
|
522
|
+
|
523
|
+
|
524
|
+
|
214
525
|
# Gets the source text parsed. If the parsing is successful, the data source is POSTed to OpenCalais
|
215
526
|
# via an EventMachine request and a callback is set to manage the OpenCalais response.
|
216
|
-
# All Dover object callbacks are then called with the request result yielded to them.
|
527
|
+
# All Dover object callbacks are then called with the request result yielded to them.
|
217
528
|
#
|
218
529
|
# @param N/A
|
219
530
|
# @return a {Class ResponseData} object
|
220
|
-
def analyse_this
|
531
|
+
def analyse_this(output_format=nil)
|
532
|
+
|
533
|
+
if output_format
|
534
|
+
@output_format = 'application/json'
|
535
|
+
else
|
536
|
+
@output_format = 'Text/Simple'
|
537
|
+
end
|
221
538
|
|
222
539
|
@document = get_src_data(@data_src)
|
223
540
|
begin
|
@@ -230,19 +547,19 @@ module DoverToCalais
|
|
230
547
|
|
231
548
|
|
232
549
|
if DoverToCalais::PROXY &&
|
233
|
-
|
234
|
-
|
550
|
+
DoverToCalais::PROXY.class.eql?('Hash') &&
|
551
|
+
DoverToCalais::PROXY.keys[0].eql?(:proxy)
|
235
552
|
|
236
553
|
connection_options = connection_options.merge(DoverToCalais::PROXY)
|
237
554
|
end
|
238
555
|
|
239
556
|
request_options = {
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
557
|
+
:body => @document.to_s,
|
558
|
+
:head => {
|
559
|
+
'x-calais-licenseID' => DoverToCalais::API_KEY,
|
560
|
+
:content_type => 'TEXT/RAW',
|
561
|
+
:enableMetadataType => 'GenericRelations,SocialTags',
|
562
|
+
:outputFormat => @output_format}
|
246
563
|
}
|
247
564
|
|
248
565
|
http = EventMachine::HttpRequest.new(CALAIS_SERVICE, connection_options ).post request_options
|
@@ -251,20 +568,34 @@ module DoverToCalais
|
|
251
568
|
http.callback do
|
252
569
|
|
253
570
|
if http.response_header.status == 200
|
254
|
-
|
255
|
-
response
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
571
|
+
if @output_format == 'Text/Simple'
|
572
|
+
http.response.match(/<OpenCalaisSimple>/) do |m|
|
573
|
+
response = Nokogiri::XML('<OpenCalaisSimple>' + m.post_match) do |config|
|
574
|
+
#strict xml parsing, disallow network connections
|
575
|
+
config.strict.nonet
|
576
|
+
end #block
|
577
|
+
end
|
578
|
+
else #@output_format == 'application/json'
|
579
|
+
response = JSON.parse(http.response) #response should now be a Hash
|
580
|
+
|
581
|
+
end #if
|
582
|
+
|
583
|
+
case response.class.to_s
|
584
|
+
when 'NilClass'
|
585
|
+
result = ResponseData.new(nil,'ERR: cannot parse response data - source invalid?')
|
586
|
+
when 'Nokogiri::XML::Document'
|
587
|
+
result = ResponseData.new(response, nil)
|
588
|
+
when 'Hash'
|
589
|
+
result = ResponseData.new(response, nil)
|
590
|
+
else
|
591
|
+
result = ResponseData.new(nil,'ERR: cannot parse response data - unrecognized format!')
|
592
|
+
end
|
593
|
+
|
594
|
+
|
595
|
+
else #non-200 response
|
596
|
+
result = ResponseData.new nil,
|
597
|
+
"ERR: OpenCalais service responded with #{http.response_header.status} - response body: '#{http.response}'"
|
598
|
+
end
|
268
599
|
|
269
600
|
@callbacks.each { |c| c.call(result) }
|
270
601
|
|
@@ -287,6 +618,7 @@ module DoverToCalais
|
|
287
618
|
end #method
|
288
619
|
|
289
620
|
|
621
|
+
|
290
622
|
alias_method :analyze_this, :analyse_this
|
291
623
|
public :to_calais, :analyse_this
|
292
624
|
private :get_src_data
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'ohm'
|
2
|
+
require 'ohm/contrib'
|
3
|
+
|
4
|
+
module DoverToCalais
|
5
|
+
class EntityModel < Ohm::Model
|
6
|
+
attribute :name
|
7
|
+
attribute :type
|
8
|
+
attribute :calais_id
|
9
|
+
set :relations, :RelationModel
|
10
|
+
set :events, :EventModel
|
11
|
+
|
12
|
+
|
13
|
+
index :name
|
14
|
+
index :type
|
15
|
+
index :calais_id
|
16
|
+
|
17
|
+
def validate
|
18
|
+
assert_present :name
|
19
|
+
assert_present :type
|
20
|
+
assert_present :calais_id
|
21
|
+
end
|
22
|
+
|
23
|
+
class RelationModel < Ohm::Model
|
24
|
+
include Ohm::DataTypes
|
25
|
+
|
26
|
+
attribute :subject, Type::Hash
|
27
|
+
attribute :object, Type::Hash
|
28
|
+
attribute :verb
|
29
|
+
attribute :detection
|
30
|
+
attribute :calais_id
|
31
|
+
|
32
|
+
index :subject
|
33
|
+
|
34
|
+
end #class
|
35
|
+
|
36
|
+
class EventModel < Ohm::Model
|
37
|
+
include Ohm::DataTypes
|
38
|
+
|
39
|
+
attribute :calais_id
|
40
|
+
attribute :info_hash, Type::Hash
|
41
|
+
|
42
|
+
end #class
|
43
|
+
|
44
|
+
end #class
|
45
|
+
|
46
|
+
end
|
@@ -1,23 +1,3 @@
|
|
1
|
-
# ontology.rb
|
2
|
-
#
|
3
|
-
# Copyright 2013 Fred <fred@fred-Veriton-X270>
|
4
|
-
#
|
5
|
-
# This program is free software; you can redistribute it and/or modify
|
6
|
-
# it under the terms of the GNU General Public License as published by
|
7
|
-
# the Free Software Foundation; either version 2 of the License, or
|
8
|
-
# (at your option) any later version.
|
9
|
-
#
|
10
|
-
# This program is distributed in the hope that it will be useful,
|
11
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13
|
-
# GNU General Public License for more details.
|
14
|
-
#
|
15
|
-
# You should have received a copy of the GNU General Public License
|
16
|
-
# along with this program; if not, write to the Free Software
|
17
|
-
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
18
|
-
# MA 02110-1301, USA.
|
19
|
-
#
|
20
|
-
#
|
21
1
|
|
22
2
|
|
23
3
|
module CalaisOntology
|
@@ -67,25 +47,38 @@ CALAIS_EVENTS = %w(
|
|
67
47
|
Alliance
|
68
48
|
AnalystEarningsEstimate
|
69
49
|
AnalystRecommendation
|
50
|
+
ArmedAttack
|
51
|
+
ArmsPurchaseSale
|
70
52
|
Arrest
|
71
53
|
Bankruptcy
|
72
54
|
BonusSharesIssuance
|
73
55
|
BusinessRelation
|
74
56
|
Buybacks
|
57
|
+
CandidatePosition
|
75
58
|
CompanyAccountingChange
|
59
|
+
CompanyAffiliates
|
60
|
+
CompanyCompetitor
|
61
|
+
CompanyCustomer
|
76
62
|
CompanyEarningsAnnouncement
|
77
63
|
CompanyEarningsGuidance
|
64
|
+
CompanyEmployeesNumber
|
78
65
|
CompanyExpansion
|
79
66
|
CompanyForceMajeure
|
67
|
+
CompanyFounded
|
80
68
|
CompanyInvestment
|
81
69
|
CompanyLaborIssues
|
82
70
|
CompanyLayoffs
|
83
71
|
CompanyLegalIssues
|
72
|
+
CompanyLocation
|
84
73
|
CompanyListingChange
|
85
74
|
CompanyMeeting
|
86
75
|
CompanyNameChange
|
76
|
+
CompanyProduct
|
87
77
|
CompanyReorganization
|
88
78
|
CompanyRestatement
|
79
|
+
CompanyTechnology
|
80
|
+
CompanyUsingProduct
|
81
|
+
CompanyTicker
|
89
82
|
ConferenceCall
|
90
83
|
Conviction
|
91
84
|
CreditRating
|
@@ -94,19 +87,32 @@ CALAIS_EVENTS = %w(
|
|
94
87
|
DiplomaticRelations
|
95
88
|
Dividend
|
96
89
|
EmploymentChange
|
90
|
+
EmploymentRelation
|
97
91
|
EnvironmentalIssue
|
92
|
+
EquityFinancing
|
98
93
|
Extinction
|
94
|
+
FamilyRelation
|
99
95
|
FDAPhase
|
100
96
|
Indictment
|
97
|
+
IndicesChanges
|
101
98
|
IPO
|
102
99
|
JointVenture
|
103
100
|
ManMadeDisaster
|
104
101
|
Merger
|
102
|
+
MilitaryAction
|
105
103
|
MovieRelease
|
106
104
|
MusicAlbumRelease
|
107
105
|
NaturalDisaster
|
108
106
|
PatentFiling
|
109
107
|
PatentIssuance
|
108
|
+
PersonAttributes
|
109
|
+
PersonCareer
|
110
|
+
PersonCommunication
|
111
|
+
PersonEducation
|
112
|
+
PersonEmailAddress
|
113
|
+
PersonLocation
|
114
|
+
PersonParty
|
115
|
+
PersonRelation
|
110
116
|
PersonTravel
|
111
117
|
PoliticalEndorsement
|
112
118
|
PoliticalRelationship
|
@@ -114,14 +120,15 @@ CALAIS_EVENTS = %w(
|
|
114
120
|
ProductIssues
|
115
121
|
ProductRecall
|
116
122
|
ProductRelease
|
117
|
-
|
123
|
+
Quotation
|
118
124
|
SecondaryIssuance
|
119
|
-
PersonCommunication
|
120
125
|
StockSplit
|
121
126
|
Trial
|
122
127
|
VotingResult
|
123
128
|
)
|
124
129
|
|
130
|
+
|
131
|
+
|
125
132
|
CALAIS_TOPICS = %w(
|
126
133
|
Business_Finance
|
127
134
|
Disaster_Accident
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dover_to_calais
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Fred Heath
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-03-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -78,6 +78,86 @@ dependencies:
|
|
78
78
|
- - ! '>='
|
79
79
|
- !ruby/object:Gem::Version
|
80
80
|
version: 0.1.9
|
81
|
+
- !ruby/object:Gem::Dependency
|
82
|
+
name: json
|
83
|
+
requirement: !ruby/object:Gem::Requirement
|
84
|
+
requirements:
|
85
|
+
- - ~>
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '1.5'
|
88
|
+
- - ! '>='
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
version: 1.5.5
|
91
|
+
type: :runtime
|
92
|
+
prerelease: false
|
93
|
+
version_requirements: !ruby/object:Gem::Requirement
|
94
|
+
requirements:
|
95
|
+
- - ~>
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: '1.5'
|
98
|
+
- - ! '>='
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: 1.5.5
|
101
|
+
- !ruby/object:Gem::Dependency
|
102
|
+
name: ohm
|
103
|
+
requirement: !ruby/object:Gem::Requirement
|
104
|
+
requirements:
|
105
|
+
- - ~>
|
106
|
+
- !ruby/object:Gem::Version
|
107
|
+
version: '2.0'
|
108
|
+
- - ! '>='
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: 2.0.0
|
111
|
+
type: :runtime
|
112
|
+
prerelease: false
|
113
|
+
version_requirements: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ~>
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '2.0'
|
118
|
+
- - ! '>='
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
version: 2.0.0
|
121
|
+
- !ruby/object:Gem::Dependency
|
122
|
+
name: ohm-contrib
|
123
|
+
requirement: !ruby/object:Gem::Requirement
|
124
|
+
requirements:
|
125
|
+
- - ~>
|
126
|
+
- !ruby/object:Gem::Version
|
127
|
+
version: '2.0'
|
128
|
+
- - ! '>='
|
129
|
+
- !ruby/object:Gem::Version
|
130
|
+
version: 2.0.0
|
131
|
+
type: :runtime
|
132
|
+
prerelease: false
|
133
|
+
version_requirements: !ruby/object:Gem::Requirement
|
134
|
+
requirements:
|
135
|
+
- - ~>
|
136
|
+
- !ruby/object:Gem::Version
|
137
|
+
version: '2.0'
|
138
|
+
- - ! '>='
|
139
|
+
- !ruby/object:Gem::Version
|
140
|
+
version: 2.0.0
|
141
|
+
- !ruby/object:Gem::Dependency
|
142
|
+
name: em-throttled_queue
|
143
|
+
requirement: !ruby/object:Gem::Requirement
|
144
|
+
requirements:
|
145
|
+
- - ~>
|
146
|
+
- !ruby/object:Gem::Version
|
147
|
+
version: '1.1'
|
148
|
+
- - ! '>='
|
149
|
+
- !ruby/object:Gem::Version
|
150
|
+
version: 1.1.0
|
151
|
+
type: :runtime
|
152
|
+
prerelease: false
|
153
|
+
version_requirements: !ruby/object:Gem::Requirement
|
154
|
+
requirements:
|
155
|
+
- - ~>
|
156
|
+
- !ruby/object:Gem::Version
|
157
|
+
version: '1.1'
|
158
|
+
- - ! '>='
|
159
|
+
- !ruby/object:Gem::Version
|
160
|
+
version: 1.1.0
|
81
161
|
- !ruby/object:Gem::Dependency
|
82
162
|
name: bundler
|
83
163
|
requirement: !ruby/object:Gem::Requirement
|
@@ -164,11 +244,14 @@ files:
|
|
164
244
|
- README.md
|
165
245
|
- Rakefile
|
166
246
|
- dover_to_calais.gemspec
|
247
|
+
- features/data_mining.feature
|
167
248
|
- features/data_sources.feature
|
168
249
|
- features/filtering.feature
|
250
|
+
- features/step_definitions/data_mining_steps.rb
|
169
251
|
- features/step_definitions/data_sources_steps.rb
|
170
252
|
- features/step_definitions/filtering_steps.rb
|
171
253
|
- lib/dover_to_calais.rb
|
254
|
+
- lib/dover_to_calais/models.rb
|
172
255
|
- lib/dover_to_calais/ontology.rb
|
173
256
|
- lib/dover_to_calais/version.rb
|
174
257
|
- test/test_file_1.doc
|
@@ -202,8 +285,10 @@ signing_key:
|
|
202
285
|
specification_version: 4
|
203
286
|
summary: An easy-to-use wrapper round the OpenCalais semantic analysis web service.
|
204
287
|
test_files:
|
288
|
+
- features/data_mining.feature
|
205
289
|
- features/data_sources.feature
|
206
290
|
- features/filtering.feature
|
291
|
+
- features/step_definitions/data_mining_steps.rb
|
207
292
|
- features/step_definitions/data_sources_steps.rb
|
208
293
|
- features/step_definitions/filtering_steps.rb
|
209
294
|
- test/test_file_1.doc
|