calais 0.0.3 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,14 @@
1
+ == 0.0.5
2
+
3
+ * fixed error where classes weren't being required in the proper order on Ubuntu (reported by Jon Moses)
4
+ * New things coming back from the API. Fixing in tests.
5
+
6
+ == 0.0.4
7
+
8
+ * changed dependency from hpricot to libxml
9
+ * utf fun
10
+ * cleanup all around
11
+
1
12
  == 0.0.3
2
13
 
3
14
  * pluginized the library for Rails (via pius: http://gitorious.org/projects/calais-au-rails)
data/README.txt CHANGED
@@ -26,7 +26,7 @@ This will return an object containing the RDF representation of the text, the na
26
26
 
27
27
  * Ruby 1.8.5 or better
28
28
  * Uses the following standard libraries: digest/sha1, net/http, yaml, cgi
29
- * Hpricot
29
+ * libxml2 / libxml-ruby
30
30
 
31
31
  == Install
32
32
 
@@ -1,14 +1,20 @@
1
1
  require 'digest/sha1'
2
2
  require 'net/http'
3
- require 'yaml'
4
3
  require 'cgi'
4
+ require 'iconv'
5
5
 
6
6
  require 'rubygems'
7
- require 'hpricot'
7
+ require 'xml/libxml'
8
8
 
9
9
  $KCODE = "UTF8"
10
+ require 'jcode'
10
11
 
11
- Dir.glob(File.join(File.dirname(__FILE__), 'calais/*.rb')).each { |f| require f }
12
+ $:.unshift File.expand_path(File.dirname(__FILE__)) + '/calais'
13
+
14
+ require 'name'
15
+ require 'relationship'
16
+ require 'response'
17
+ require 'client'
12
18
 
13
19
  module Calais
14
20
  POST_URL = "http://api.opencalais.com"
@@ -35,13 +41,16 @@ module Calais
35
41
 
36
42
  class << self
37
43
  def enlighten(*args, &block) Client.new(*args, &block).call(:enlighten) end
44
+
38
45
  def process_document(*args, &block)
39
46
  data, error = Calais.enlighten(*args, &block)
40
- Client.process_data(data, error)
47
+ process_data(data, error)
41
48
  end
49
+
50
+ def process_data(data, error=nil) Response.new(data, error) end
42
51
  end
43
52
  end
44
53
 
45
54
  module Calais
46
- VERSION = '0.0.3'
55
+ VERSION = '0.0.5'
47
56
  end
@@ -11,37 +11,23 @@ module Calais
11
11
  yield(self) if block_given?
12
12
  end
13
13
 
14
- def call(method, times=1)
14
+ def call(method)
15
15
  method = method.intern unless method.is_a?(Symbol)
16
16
  raise ArgumentError.new("Unknown method: #{method}") unless AVAILABLE_METHODS.keys.include? method
17
17
 
18
18
  post_args = {
19
19
  "licenseID" => @license_id,
20
- "content" => @content,
20
+ "content" => Iconv.iconv('UTF-8//IGNORE', 'UTF-8', "#{@content} ").first[0..-2],
21
21
  "paramsXML" => params_xml
22
22
  }
23
23
 
24
24
  url = URI.parse(POST_URL + AVAILABLE_METHODS[method])
25
25
  resp, data = Net::HTTP.post_form(url, post_args)
26
26
 
27
- handle_response(resp, data, method, times)
28
- end
29
-
30
- def self.process_data(data, error=nil)
31
- Calais::Response.new(data, error)
27
+ return resp.is_a?(Net::HTTPOK) ? data : [data, "API Error: #{resp}"]
32
28
  end
33
29
 
34
30
  private
35
- def handle_response(resp, data, method, times)
36
- if resp.is_a? Net::HTTPOK
37
- [data, nil]
38
- elsif times >= MAX_RETRIES
39
- [data, "Too many retries: #{times}"]
40
- else
41
- call(method, times+1)
42
- end
43
- end
44
-
45
31
  def params_xml
46
32
  content_type = @content_type && AVAILABLE_CONTENT_TYPES.keys.include?(@content_type) ? AVAILABLE_CONTENT_TYPES[@content_type] : AVAILABLE_CONTENT_TYPES[DEFAULT_CONTENT_TYPE]
47
33
  output_format = @output_format && AVAILABLE_OUTPUT_FORMATS.keys.include?(@output_format) ? AVAILABLE_OUTPUT_FORMATS[@output_format] : AVAILABLE_OUTPUT_FORMATS[DEFAULT_OUTPUT_FORMAT]
@@ -7,15 +7,11 @@ module Calais
7
7
  @names = []
8
8
  @relationships = []
9
9
 
10
- parse_rdf(raw)
10
+ parse_raw(raw)
11
11
  return if @error
12
-
13
- h_doc = Hpricot.XML(@rdf)
14
- document_node = h_doc.root.search("//rdf:Description//c:document//..").remove.first
15
- signature_node = h_doc.root.search("//rdf:Description//c:signature//..").remove.first
16
- language_node = h_doc.root.search("//rdf:Description//c:lang//..").remove.first
17
- h_doc = parse_names(h_doc)
18
- h_doc = parse_relationships(h_doc)
12
+
13
+ parse_names
14
+ parse_relationships
19
15
  end
20
16
 
21
17
  Name::TYPES.each_pair do |method_name, type|
@@ -25,30 +21,23 @@ module Calais
25
21
  end
26
22
 
27
23
  private
28
- def parse_rdf(raw)
29
- @rdf = CGI::unescapeHTML Hpricot.XML(raw).at("/string").inner_html
30
- @hpricot = Hpricot.XML(@rdf)
31
- @error = Hpricot.XML(response).at("/Error/Exception").inner_html rescue @error
24
+ def parse_raw(raw)
25
+ @libxml = XML::Parser.string(XML::Parser.string(raw).parse.root.child.content).parse
26
+ @rdf = @libxml.to_s
27
+ @error = @libxml.find("/Error/Exception").first.content rescue @error
32
28
  end
33
29
 
34
- def parse_names(doc)
35
- name_elements = doc.root.search("//rdf:Description//c:name//..")
36
- @names = name_elements.map do |ele|
37
- name = ele.at("c:name").inner_html
38
- type = ele.at("rdf:type").attributes["rdf:resource"].split("/").last
39
- hash = ele.attributes["rdf:about"].split("/").last
40
-
41
- detection_nodes = doc.root.search("//rdf:Description//c:subject//..").collect! do |ele|
42
- ele unless ele.at("c:subject").attributes["rdf:resource"].match(hash).nil?
43
- end.compact
30
+ def parse_names
31
+ @names = @libxml.root.find("rdf:Description/rdf:type[contains(@rdf:resource, '/em/e/')]/..").map do |n|
32
+ name = n.find_first("c:name").content
33
+ type = n.find_first("rdf:type").properties.to_a.assoc("resource").last.split('/').last
34
+ hash = n.properties.to_a.assoc("about").last.split("/").last
44
35
 
45
- locations = detection_nodes.map do |ele|
46
- start = ele.at("c:offset").inner_html.to_i
47
- Range.new(start, start+ele.at("c:length").inner_html.to_i)
36
+ locations = @libxml.root.find("rdf:Description/c:subject[contains(@rdf:resource, '#{hash}')]/..").map do |n2|
37
+ start = n2.find_first("c:offset").content.to_i
38
+ Range.new(start, start+n2.find_first("c:length").content.to_i)
48
39
  end
49
40
 
50
- detection_nodes.remove
51
-
52
41
  Name.new(
53
42
  :name => name,
54
43
  :hash => hash,
@@ -56,47 +45,34 @@ module Calais
56
45
  :locations => locations
57
46
  )
58
47
  end
59
- name_elements.remove
60
-
61
- doc
62
48
  end
63
49
 
64
- def parse_relationships(doc)
65
- relationship_elements = doc.root.search("rdf:Description")
66
- @relationships = relationship_elements.map do |ele|
67
- next if ele.at("c:docId")
68
-
69
- hash = ele.attributes["rdf:about"].split("/").last
70
- type = ele.at("rdf:type").attributes["rdf:resource"].split("/").last
50
+ def parse_relationships
51
+ @libxml.root.find("rdf:Description/rdf:type[contains(@rdf:resource, '/em/r')]/..").each do |n|
52
+ hash = n.properties.to_a.assoc("about").last.split("/").last
53
+ type = n.find_first("rdf:type").properties.to_a.assoc("resource").last.split('/').last
54
+
71
55
  metadata = {}
72
- ele.children.each do |child|
73
- next if child.comment? || child.name == "rdf:type"
74
-
75
- value = if child.attributes["rdf:resource"]
76
- Name.find_in_names(child.attributes["rdf:resource"].split("/").last, @names) rescue nil
77
- else
78
- child.inner_html.strip
79
- end
80
- metadata[child.name.split(":").last] = value
56
+
57
+ n.to_a.each do |n2|
58
+ next if n2.name == "type" or n2.comment?
59
+ resource = n2.properties.to_a.assoc("resource")
60
+ metadata[n2.name] = resource ? Name.find_in_names(resource.last.split("/").last, @names) : n2.content.strip
81
61
  end
82
62
 
83
- locations = doc.root.search("//rdf:Description//c:docId//..").collect! do |ele|
84
- ele unless ele.at("c:subject").attributes["rdf:resource"].match(hash).nil?
85
- end.compact.map do |ele|
86
- start = ele.at("c:offset").inner_html.to_i
87
- Range.new(start, start+ele.at("c:length").inner_html.to_i)
63
+ locations = @libxml.root.find("rdf:Description/c:subject[contains(@rdf:resource, '#{hash}')]/..").map do |n2|
64
+ start = n2.find_first("c:offset").content.to_i
65
+ Range.new(start, start+n2.find_first("c:length").content.to_i)
88
66
  end
89
67
 
90
- Relationship.new(
68
+
69
+ @relationships << Relationship.new(
91
70
  :type => type,
92
71
  :hash => hash,
93
72
  :metadata => metadata,
94
73
  :locations => locations
95
74
  )
96
- end.compact
97
- relationship_elements.remove
98
-
99
- doc
75
+ end
100
76
  end
101
77
  end
102
78
  end
@@ -52,7 +52,7 @@ describe Calais, ".process_document" do
52
52
  it "returns relationships" do
53
53
  @response.relationships.should_not be_nil
54
54
  @response.relationships.should_not be_empty
55
- @response.relationships.map {|r| r.type }.should == ["PersonProfessional"]
55
+ @response.relationships.map {|r| r.type }.should == ["Quotation", "Quotation", "PersonProfessional", "Quotation", "Quotation", "Quotation", "Quotation"]
56
56
  end
57
57
 
58
58
  end
metadata CHANGED
@@ -1,57 +1,33 @@
1
1
  --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.2
3
+ specification_version: 1
2
4
  name: calais
3
5
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
5
- platform: ruby
6
- authors:
7
- - Abhay Kumar
6
+ version: 0.0.5
7
+ date: 2008-03-12 00:00:00 -07:00
8
+ summary: A Ruby interface to the Calais Web Service
9
+ require_paths:
10
+ - lib
11
+ email: info@opensynapse.net
12
+ homepage: http://calais.rubyforge.org
13
+ rubyforge_project: calais
14
+ description: "== Features * Accepts documents in text/plain, text/xml and text/html format. * Basic access to the Open Calais API's Enlighten action. * Output is RDF representation of input document. * Single function ability to tag a document and receive a response in RDF format, names in the document, and their relationships. == Synopsis This is a very basic wrapper to the Open Calais API. It uses the POST endpoint and currently supports the Enlighten action. Here's a simple call: Calais.enlighten(:content => \"The government of the United Kingdom has given corporations like fast food chain McDonald's the right to award high school qualifications to employees who complete a company training program.\", :content_type => :text, :license_id => LICENSE_ID) This is the easiest way to get the RDF-formated response from the OpenCalais service. If you want to do something more fun like getting all sorts of fun information about a document, you can try this: Calais.process_document(:content => \"The government of the United Kingdom has given corporations like fast food chain McDonald's the right to award high school qualifications to employees who complete a company training program.\", :content_type => :text, :license_id => LICENSE_ID) This will return an object containing the RDF representation of the text, the names in the text, and any relationships that exist there."
8
15
  autorequire:
16
+ default_executable:
9
17
  bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
10
27
  cert_chain:
11
- - |
12
- -----BEGIN CERTIFICATE-----
13
- MIIDNjCCAh6gAwIBAgIBADANBgkqhkiG9w0BAQUFADBBMQ0wCwYDVQQDDARpbmZv
14
- MRswGQYKCZImiZPyLGQBGRYLb3BlbnN5bmFwc2UxEzARBgoJkiaJk/IsZAEZFgNu
15
- ZXQwHhcNMDgwMjAzMDUwODQzWhcNMDkwMjAyMDUwODQzWjBBMQ0wCwYDVQQDDARp
16
- bmZvMRswGQYKCZImiZPyLGQBGRYLb3BlbnN5bmFwc2UxEzARBgoJkiaJk/IsZAEZ
17
- FgNuZXQwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQCmaA3Od1p42luz
18
- zDJepXD3VBFEmmeCUCOjs8rkGIlhRibBvAU8GB0hhkTUykeF6JvAp68FYtIqyTqM
19
- EY7bnyYTWsvX7HrX/wGRshSKZPnxn2b0AnZ9T3QQZyUut1YQ5G+kBQrI76hz9ynA
20
- l0mPCiGxrh+yUNTKt7KzOAzQbtPlqGiIzj+aYvzmdEsj24Ekm/11A/ntPnz+N/Wj
21
- yS5c2tbfZdU8NfwfHCZQUBE4PROYCCjoly0QChvBQzKSZPrEpJB3EedMUyBc5m5E
22
- TQ0u5aItr3isQchwo410x7ixzVveVzn4mchaGCZ3ZuPwaQkuI/7KSSWWH1LCouct
23
- N7LsWR7jAgMBAAGjOTA3MAkGA1UdEwQCMAAwCwYDVR0PBAQDAgSwMB0GA1UdDgQW
24
- BBRsRhkAWj4iWaut121ZcaOAKXG27DANBgkqhkiG9w0BAQUFAAOCAQEAFuqEVgKC
25
- U6f237SZ/hzevOwRkaErF1EcaCEVzuNj+KNdbQOK9oOo+hHyos3jUo17TiUNDi+3
26
- VJhw3cOkA/PEpa0ou0Vm8VIfXdp6dh62NhTKHBVwQ/qXHnn3aVuV/zIfOmi9WQ+t
27
- mr7ehGTw7URly95GOESW4NKQ95p+iquAh/NGhtHGFt+nxjJGUkkYlnGVaxmmgof3
28
- sP2hOrejIrD9jAoejiRhiA+IyEoaYJvlh+D+3MngvnyDFqHiFZgngM0fvTnMTsgT
29
- avOOKhLsesocjiElkLMv8mwuY+L8P4tSvDTDKXxM9Bx/YagwgzYCqPoGtFdWI/GI
30
- +keKvrmaTOJ7CQ==
31
- -----END CERTIFICATE-----
32
-
33
- date: 2008-02-07 00:00:00 -08:00
34
- default_executable:
35
- dependencies:
36
- - !ruby/object:Gem::Dependency
37
- name: hoe
38
- version_requirement:
39
- version_requirements: !ruby/object:Gem::Requirement
40
- requirements:
41
- - - ">="
42
- - !ruby/object:Gem::Version
43
- version: 1.5.0
44
- version:
45
- description: "== Features * Accepts documents in text/plain, text/xml and text/html format. * Basic access to the Open Calais API's Enlighten action. * Output is RDF representation of input document. * Single function ability to tag a document and receive a response in RDF format, names in the document, and their relationships. == Synopsis This is a very basic wrapper to the Open Calais API. It uses the POST endpoint and currently supports the Enlighten action. Here's a simple call: Calais.enlighten(:content => \"The government of the United Kingdom has given corporations like fast food chain McDonald's the right to award high school qualifications to employees who complete a company training program.\", :content_type => :text, :license_id => LICENSE_ID) This is the easiest way to get the RDF-formated response from the OpenCalais service. If you want to do something more fun like getting all sorts of fun information about a document, you can try this: Calais.process_document(:content => \"The government of the United Kingdom has given corporations like fast food chain McDonald's the right to award high school qualifications to employees who complete a company training program.\", :content_type => :text, :license_id => LICENSE_ID) This will return an object containing the RDF representation of the text, the names in the text, and any relationships that exist there."
46
- email: info@opensynapse.net
47
- executables: []
48
-
49
- extensions: []
50
-
51
- extra_rdoc_files:
52
- - History.txt
53
- - Manifest.txt
54
- - README.txt
28
+ post_install_message:
29
+ authors:
30
+ - Abhay Kumar
55
31
  files:
56
32
  - History.txt
57
33
  - MIT-LICENSE
@@ -71,32 +47,28 @@ files:
71
47
  - spec/fixtures/slovenia_euro.xml
72
48
  - spec/helper.rb
73
49
  - spec/spec.opts
74
- has_rdoc: true
75
- homepage: http://calais.rubyforge.org
76
- post_install_message:
50
+ test_files: []
51
+
77
52
  rdoc_options:
78
53
  - --main
79
54
  - README.txt
80
- require_paths:
81
- - lib
82
- required_ruby_version: !ruby/object:Gem::Requirement
83
- requirements:
84
- - - ">="
85
- - !ruby/object:Gem::Version
86
- version: "0"
87
- version:
88
- required_rubygems_version: !ruby/object:Gem::Requirement
89
- requirements:
90
- - - ">="
91
- - !ruby/object:Gem::Version
92
- version: "0"
93
- version:
94
- requirements: []
55
+ extra_rdoc_files:
56
+ - History.txt
57
+ - Manifest.txt
58
+ - README.txt
59
+ executables: []
95
60
 
96
- rubyforge_project: calais
97
- rubygems_version: 1.0.1
98
- signing_key:
99
- specification_version: 2
100
- summary: A Ruby interface to the Calais Web Service
101
- test_files: []
61
+ extensions: []
62
+
63
+ requirements: []
102
64
 
65
+ dependencies:
66
+ - !ruby/object:Gem::Dependency
67
+ name: hoe
68
+ version_requirement:
69
+ version_requirements: !ruby/object:Gem::Version::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: 1.4.0
74
+ version:
data.tar.gz.sig DELETED
@@ -1 +0,0 @@
1
- ��i/>�[ɵ:��m��<����
metadata.gz.sig DELETED
Binary file