calais 0.0.1 → 0.0.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
data.tar.gz.sig CHANGED
Binary file
data/History.txt CHANGED
@@ -1,3 +1,11 @@
1
+ == 0.0.2
2
+
3
+ * cleanup in the specs
4
+ * cleaner parsing
5
+ * location of named entities
6
+ * more data in relationships
7
+ * moved Names and Relationships
8
+
1
9
  == 0.0.1
2
10
 
3
11
  * Access to Calais's Enlighten action
data/MIT-LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2008 Abhay Kumar info@opensynapse.net
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ 'Software'), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Manifest.txt CHANGED
@@ -1,9 +1,12 @@
1
1
  History.txt
2
+ MIT-LICENSE
2
3
  Manifest.txt
3
4
  README.txt
4
5
  Rakefile
5
6
  lib/calais.rb
6
7
  lib/calais/client.rb
8
+ lib/calais/name.rb
9
+ lib/calais/relationship.rb
7
10
  lib/calais/response.rb
8
11
  spec/calais_spec.rb
9
12
  spec/fixtures/.gitignore
data/README.txt CHANGED
@@ -30,7 +30,7 @@ This will return an object containing the RDF representation of the text, the na
30
30
 
31
31
  == Install
32
32
 
33
- TODO
33
+ You can install the Calais gem via Rubygems (gem install calais) or by building from source.
34
34
 
35
35
  == Authors
36
36
 
data/lib/calais.rb CHANGED
@@ -43,5 +43,5 @@ module Calais
43
43
  end
44
44
 
45
45
  module Calais
46
- VERSION = '0.0.1'
46
+ VERSION = '0.0.2'
47
47
  end
@@ -0,0 +1,13 @@
1
+ module Calais
2
+ class Name
3
+ attr_accessor :name, :type, :hash, :locations
4
+
5
+ def initialize(args={})
6
+ args.each {|k,v| send("#{k}=", v)}
7
+ end
8
+
9
+ def self.find_in_names(hash, names)
10
+ names.select {|name| name.hash == hash }.first
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,9 @@
1
+ module Calais
2
+ class Relationship
3
+ attr_accessor :type, :hash, :metadata, :locations
4
+
5
+ def initialize(args={})
6
+ args.each {|k,v| send("#{k}=", v)}
7
+ end
8
+ end
9
+ end
@@ -8,38 +8,14 @@ module Calais
8
8
  @relationships = []
9
9
 
10
10
  parse_rdf(raw)
11
- parse_names
12
- parse_relationships
13
- end
14
-
15
- def to_dot
16
- used = []
17
- id = @hpricot.at("rdf:Description//c:document//..").attributes["c:externalID"]
18
-
19
- dot = "digraph \"#{id}\"\n"
20
- dot += "{\n"
21
- dot += "\tgraph [rankdir=LR, overlap=false];\n"
22
- dot += "\tnode [shape = circle];"
11
+ return if @error
23
12
 
24
- @relationships.each do |rel|
25
- dot += "\t\"#{rel.actor.name}\" -> \"#{rel.target.name}\""
26
- dot += " ["
27
- dot += "label=\""
28
- dot += "#{rel.metadata} " if rel.metadata
29
- dot += "(#{rel.type})"
30
- dot += "\"];\n"
31
- used |= [rel.actor.hash, rel.target.hash]
32
- end
33
-
34
- @names.each do |name|
35
- dot += "\t\"#{name.name}\";\n" unless used.include?(name.hash)
36
- end
37
- dot += "}\n"
38
-
39
- f = File.open("#{id}.dot", 'w')
40
- f.puts dot
41
- f.close
42
-
13
+ h_doc = Hpricot.XML(@rdf)
14
+ document_node = h_doc.root.search("//rdf:Description//c:document//..").remove.first
15
+ signature_node = h_doc.root.search("//rdf:Description//c:signature//..").remove.first
16
+ language_node = h_doc.root.search("//rdf:Description//c:lang//..").remove.first
17
+ h_doc = parse_names(h_doc)
18
+ h_doc = parse_relationships(h_doc)
43
19
  end
44
20
 
45
21
  private
@@ -49,59 +25,72 @@ module Calais
49
25
  @error = Hpricot.XML(response).at("/Error/Exception").inner_html rescue @error
50
26
  end
51
27
 
52
- def parse_names
53
- @names = @hpricot.root.search("rdf:Description//c:name//..").map do |ele|
54
- Calais::Response::Name.new(
55
- :name => ele.at("c:name").inner_html,
56
- :hash => ele.attributes["rdf:about"].split('/').last,
57
- :type => ele.at("rdf:type").attributes["rdf:resource"].split('/').last
58
- )
59
- end unless @error
60
- end
61
-
62
- def parse_relationships
63
- doc = Hpricot.XML(@rdf)
64
- doc.search("rdf:Description//c:docId//..").remove
65
- doc.search("rdf:Description//c:document//..").remove
66
- doc.search("rdf:Description//c:name//..").remove
67
-
68
- @relationships = doc.root.search("rdf:Description").map do |ele|
69
- relationship = ele.at("rdf:type")
70
- actor = relationship.next_sibling
71
- metadata = actor.next_sibling.attributes["rdf:resource"] ? nil : actor.next_sibling.inner_html.strip
72
- target = metadata ? actor.next_sibling.next_sibling : actor.next_sibling
28
+ def parse_names(doc)
29
+ name_elements = doc.root.search("//rdf:Description//c:name//..")
30
+ @names = name_elements.map do |ele|
31
+ name = ele.at("c:name").inner_html
32
+ type = ele.at("rdf:type").attributes["rdf:resource"].split("/").last
33
+ hash = ele.attributes["rdf:about"].split("/").last
73
34
 
74
- actor_name = actor ? Name.find_in_names(actor.attributes["rdf:resource"].split('/').last, @names) : nil
75
- target_name = target ? Name.find_in_names(target.attributes["rdf:resource"].split('/').last, @names) : nil
76
-
77
- Calais::Response::Relationship.new(
78
- :type => relationship.attributes["rdf:resource"].split('/').last,
79
- :actor => actor_name,
80
- :target => target_name,
81
- :metadata => metadata
35
+ detection_nodes = doc.root.search("//rdf:Description//c:subject//..").collect! do |ele|
36
+ ele unless ele.at("c:subject").attributes["rdf:resource"].match(hash).nil?
37
+ end.compact
38
+
39
+ locations = detection_nodes.map do |ele|
40
+ start = ele.at("c:offset").inner_html.to_i
41
+ Range.new(start, start+ele.at("c:length").inner_html.to_i)
42
+ end
43
+
44
+ detection_nodes.remove
45
+
46
+ Name.new(
47
+ :name => name,
48
+ :hash => hash,
49
+ :type => type,
50
+ :locations => locations
82
51
  )
83
52
  end
84
- end
85
-
86
- class Name
87
- include Comparable
88
- attr_accessor :name, :type, :hash
89
-
90
- def initialize(args={})
91
- args.each {|k,v| send("#{k}=", v)}
53
+ name_elements.remove
54
+
55
+ doc
92
56
  end
93
57
 
94
- def self.find_in_names(hash, names)
95
- names.select {|name| name.hash == hash }.first
96
- end
97
- end
98
-
99
- class Relationship
100
- attr_accessor :type, :actor, :target, :metadata
101
-
102
- def initialize(args={})
103
- args.each {|k,v| send("#{k}=", v)}
58
+ def parse_relationships(doc)
59
+ relationship_elements = doc.root.search("rdf:Description")
60
+ @relationships = relationship_elements.map do |ele|
61
+ next if ele.at("c:docId")
62
+
63
+ hash = ele.attributes["rdf:about"].split("/").last
64
+ type = ele.at("rdf:type").attributes["rdf:resource"].split("/").last
65
+ metadata = {}
66
+ ele.children.each do |child|
67
+ next if child.comment? || child.name == "rdf:type"
68
+
69
+ value = if child.attributes["rdf:resource"]
70
+ Name.find_in_names(child.attributes["rdf:resource"].split("/").last, @names) rescue nil
71
+ else
72
+ child.inner_html.strip
73
+ end
74
+ metadata[child.name.split(":").last] = value
75
+ end
76
+
77
+ locations = doc.root.search("//rdf:Description//c:docId//..").collect! do |ele|
78
+ ele unless ele.at("c:subject").attributes["rdf:resource"].match(hash).nil?
79
+ end.compact.map do |ele|
80
+ start = ele.at("c:offset").inner_html.to_i
81
+ Range.new(start, start+ele.at("c:length").inner_html.to_i)
82
+ end
83
+
84
+ Relationship.new(
85
+ :type => type,
86
+ :hash => hash,
87
+ :metadata => metadata,
88
+ :locations => locations
89
+ )
90
+ end.compact
91
+ relationship_elements.remove
92
+
93
+ doc
104
94
  end
105
- end
106
95
  end
107
96
  end
data/spec/calais_spec.rb CHANGED
@@ -36,17 +36,25 @@ describe Calais::Client, ".new" do
36
36
  end
37
37
 
38
38
  describe Calais, ".process_document" do
39
+ before(:all) { @response = Calais.process_document(:content => SAMPLE_DOCUMENT, :content_type => :xml, :license_id => LICENSE_ID) }
40
+
39
41
  it "returns a Calais::Response" do
40
- response = Calais.process_document(:content => SAMPLE_DOCUMENT, :content_type => :xml, :license_id => LICENSE_ID)
41
- response.should_not be_nil
42
- response.should be_a_kind_of(Calais::Response)
42
+ @response.should_not be_nil
43
+ @response.should be_a_kind_of(Calais::Response)
43
44
  end
44
-
45
- it "returns a Calais::Response (with relationships)" do
46
- response = Calais.process_document(:content => File.read(File.join(File.dirname(__FILE__), 'fixtures', 'bicycles_austrailia.xml')), :content_type => :xml, :license_id => LICENSE_ID)
47
- response.should_not be_nil
48
- response.should be_a_kind_of(Calais::Response)
45
+
46
+ it "returns names" do
47
+ @response.names.should_not be_nil
48
+ @response.names.should_not be_empty
49
+ @response.names.map {|n| n.name }.sort.should == ["Australia", "Australia", "Cycling Promotion Fund", "Ian Christie", "car manufacturers", "car market", "car sales", "company car"]
49
50
  end
51
+
52
+ it "returns relationships" do
53
+ @response.relationships.should_not be_nil
54
+ @response.relationships.should_not be_empty
55
+ @response.relationships.map {|r| r.type }.should == ["PersonProfessional"]
56
+ end
57
+
50
58
  end
51
59
 
52
60
  describe Calais::Client, ".call" do
@@ -67,6 +75,6 @@ describe Calais::Client, ".params_xml" do
67
75
  it "returns an xml encoded string" do
68
76
  client = Calais::Client.new(:content => SAMPLE_DOCUMENT, :content_type => :xml, :license_id => LICENSE_ID)
69
77
  client.send("params_xml").should_not be_nil
70
- client.send("params_xml").should == %[<c:params xmlns:c=\"http://s.opencalais.com/1/pred/\" xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"><c:processingDirectives c:contentType=\"TEXT/XML\" c:outputFormat=\"XML/RDF\"></c:processingDirectives><c:userDirectives c:allowDistribution=\"false\" c:allowSearch=\"false\" c:externalID=\"dc68d5a382724c2238d9f22ba9c0b4d2581569d8\" c:submitter=\"calais.rb\"></c:userDirectives><c:externalMetadata></c:externalMetadata></c:params>]
78
+ client.send("params_xml").should == %[<c:params xmlns:c=\"http://s.opencalais.com/1/pred/\" xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"><c:processingDirectives c:contentType=\"TEXT/XML\" c:outputFormat=\"XML/RDF\"></c:processingDirectives><c:userDirectives c:allowDistribution=\"false\" c:allowSearch=\"false\" c:externalID=\"4a661f3cd285d43fa4df971e14e623eb51748e27\" c:submitter=\"calais.rb\"></c:userDirectives><c:externalMetadata></c:externalMetadata></c:params>]
71
79
  end
72
80
  end
data/spec/helper.rb CHANGED
@@ -4,5 +4,5 @@ require 'yaml'
4
4
 
5
5
  require File.dirname(__FILE__) + '/../lib/calais'
6
6
 
7
- SAMPLE_DOCUMENT = File.read(File.join(File.dirname(__FILE__), 'fixtures/slovenia_euro.xml'))
7
+ SAMPLE_DOCUMENT = File.read(File.join(File.dirname(__FILE__), 'fixtures/bicycles_austrailia.xml'))
8
8
  LICENSE_ID = YAML.load(File.read(File.join(File.dirname(__FILE__), 'fixtures/calais.yml')))['key']
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: calais
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Abhay Kumar
@@ -30,7 +30,7 @@ cert_chain:
30
30
  +keKvrmaTOJ7CQ==
31
31
  -----END CERTIFICATE-----
32
32
 
33
- date: 2008-02-02 00:00:00 -08:00
33
+ date: 2008-02-07 00:00:00 -08:00
34
34
  default_executable:
35
35
  dependencies:
36
36
  - !ruby/object:Gem::Dependency
@@ -54,11 +54,14 @@ extra_rdoc_files:
54
54
  - README.txt
55
55
  files:
56
56
  - History.txt
57
+ - MIT-LICENSE
57
58
  - Manifest.txt
58
59
  - README.txt
59
60
  - Rakefile
60
61
  - lib/calais.rb
61
62
  - lib/calais/client.rb
63
+ - lib/calais/name.rb
64
+ - lib/calais/relationship.rb
62
65
  - lib/calais/response.rb
63
66
  - spec/calais_spec.rb
64
67
  - spec/fixtures/.gitignore
metadata.gz.sig CHANGED
Binary file