calais 0.0.3 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,14 @@
1
+ == 0.0.5
2
+
3
+ * fixed error where classes weren't being required in the proper order on Ubuntu (reported by Jon Moses)
4
+ * the API now returns additional data; updated the specs to match
5
+
6
+ == 0.0.4
7
+
8
+ * changed dependency from hpricot to libxml
9
+ * UTF-8 handling improvements
10
+ * cleanup all around
11
+
1
12
  == 0.0.3
2
13
 
3
14
  * pluginized the library for Rails (via pius: http://gitorious.org/projects/calais-au-rails)
data/README.txt CHANGED
@@ -26,7 +26,7 @@ This will return an object containing the RDF representation of the text, the na
26
26
 
27
27
  * Ruby 1.8.5 or better
28
28
  * Uses the following standard libraries: digest/sha1, net/http, yaml, cgi
29
- * Hpricot
29
+ * libxml2 / libxml-ruby
30
30
 
31
31
  == Install
32
32
 
@@ -1,14 +1,20 @@
1
1
  require 'digest/sha1'
2
2
  require 'net/http'
3
- require 'yaml'
4
3
  require 'cgi'
4
+ require 'iconv'
5
5
 
6
6
  require 'rubygems'
7
- require 'hpricot'
7
+ require 'xml/libxml'
8
8
 
9
9
  $KCODE = "UTF8"
10
+ require 'jcode'
10
11
 
11
- Dir.glob(File.join(File.dirname(__FILE__), 'calais/*.rb')).each { |f| require f }
12
+ $:.unshift File.expand_path(File.dirname(__FILE__)) + '/calais'
13
+
14
+ require 'name'
15
+ require 'relationship'
16
+ require 'response'
17
+ require 'client'
12
18
 
13
19
  module Calais
14
20
  POST_URL = "http://api.opencalais.com"
@@ -35,13 +41,16 @@ module Calais
35
41
 
36
42
  class << self
37
43
  def enlighten(*args, &block) Client.new(*args, &block).call(:enlighten) end
44
+
38
45
  def process_document(*args, &block)
39
46
  data, error = Calais.enlighten(*args, &block)
40
- Client.process_data(data, error)
47
+ process_data(data, error)
41
48
  end
49
+
50
+ def process_data(data, error=nil) Response.new(data, error) end
42
51
  end
43
52
  end
44
53
 
45
54
  module Calais
46
- VERSION = '0.0.3'
55
+ VERSION = '0.0.5'
47
56
  end
@@ -11,37 +11,23 @@ module Calais
11
11
  yield(self) if block_given?
12
12
  end
13
13
 
14
- def call(method, times=1)
14
+ def call(method)
15
15
  method = method.intern unless method.is_a?(Symbol)
16
16
  raise ArgumentError.new("Unknown method: #{method}") unless AVAILABLE_METHODS.keys.include? method
17
17
 
18
18
  post_args = {
19
19
  "licenseID" => @license_id,
20
- "content" => @content,
20
+ "content" => Iconv.iconv('UTF-8//IGNORE', 'UTF-8', "#{@content} ").first[0..-2],
21
21
  "paramsXML" => params_xml
22
22
  }
23
23
 
24
24
  url = URI.parse(POST_URL + AVAILABLE_METHODS[method])
25
25
  resp, data = Net::HTTP.post_form(url, post_args)
26
26
 
27
- handle_response(resp, data, method, times)
28
- end
29
-
30
- def self.process_data(data, error=nil)
31
- Calais::Response.new(data, error)
27
+ return resp.is_a?(Net::HTTPOK) ? data : [data, "API Error: #{resp}"]
32
28
  end
33
29
 
34
30
  private
35
- def handle_response(resp, data, method, times)
36
- if resp.is_a? Net::HTTPOK
37
- [data, nil]
38
- elsif times >= MAX_RETRIES
39
- [data, "Too many retries: #{times}"]
40
- else
41
- call(method, times+1)
42
- end
43
- end
44
-
45
31
  def params_xml
46
32
  content_type = @content_type && AVAILABLE_CONTENT_TYPES.keys.include?(@content_type) ? AVAILABLE_CONTENT_TYPES[@content_type] : AVAILABLE_CONTENT_TYPES[DEFAULT_CONTENT_TYPE]
47
33
  output_format = @output_format && AVAILABLE_OUTPUT_FORMATS.keys.include?(@output_format) ? AVAILABLE_OUTPUT_FORMATS[@output_format] : AVAILABLE_OUTPUT_FORMATS[DEFAULT_OUTPUT_FORMAT]
@@ -7,15 +7,11 @@ module Calais
7
7
  @names = []
8
8
  @relationships = []
9
9
 
10
- parse_rdf(raw)
10
+ parse_raw(raw)
11
11
  return if @error
12
-
13
- h_doc = Hpricot.XML(@rdf)
14
- document_node = h_doc.root.search("//rdf:Description//c:document//..").remove.first
15
- signature_node = h_doc.root.search("//rdf:Description//c:signature//..").remove.first
16
- language_node = h_doc.root.search("//rdf:Description//c:lang//..").remove.first
17
- h_doc = parse_names(h_doc)
18
- h_doc = parse_relationships(h_doc)
12
+
13
+ parse_names
14
+ parse_relationships
19
15
  end
20
16
 
21
17
  Name::TYPES.each_pair do |method_name, type|
@@ -25,30 +21,23 @@ module Calais
25
21
  end
26
22
 
27
23
  private
28
- def parse_rdf(raw)
29
- @rdf = CGI::unescapeHTML Hpricot.XML(raw).at("/string").inner_html
30
- @hpricot = Hpricot.XML(@rdf)
31
- @error = Hpricot.XML(response).at("/Error/Exception").inner_html rescue @error
24
+ def parse_raw(raw)
25
+ @libxml = XML::Parser.string(XML::Parser.string(raw).parse.root.child.content).parse
26
+ @rdf = @libxml.to_s
27
+ @error = @libxml.find("/Error/Exception").first.content rescue @error
32
28
  end
33
29
 
34
- def parse_names(doc)
35
- name_elements = doc.root.search("//rdf:Description//c:name//..")
36
- @names = name_elements.map do |ele|
37
- name = ele.at("c:name").inner_html
38
- type = ele.at("rdf:type").attributes["rdf:resource"].split("/").last
39
- hash = ele.attributes["rdf:about"].split("/").last
40
-
41
- detection_nodes = doc.root.search("//rdf:Description//c:subject//..").collect! do |ele|
42
- ele unless ele.at("c:subject").attributes["rdf:resource"].match(hash).nil?
43
- end.compact
30
+ def parse_names
31
+ @names = @libxml.root.find("rdf:Description/rdf:type[contains(@rdf:resource, '/em/e/')]/..").map do |n|
32
+ name = n.find_first("c:name").content
33
+ type = n.find_first("rdf:type").properties.to_a.assoc("resource").last.split('/').last
34
+ hash = n.properties.to_a.assoc("about").last.split("/").last
44
35
 
45
- locations = detection_nodes.map do |ele|
46
- start = ele.at("c:offset").inner_html.to_i
47
- Range.new(start, start+ele.at("c:length").inner_html.to_i)
36
+ locations = @libxml.root.find("rdf:Description/c:subject[contains(@rdf:resource, '#{hash}')]/..").map do |n2|
37
+ start = n2.find_first("c:offset").content.to_i
38
+ Range.new(start, start+n2.find_first("c:length").content.to_i)
48
39
  end
49
40
 
50
- detection_nodes.remove
51
-
52
41
  Name.new(
53
42
  :name => name,
54
43
  :hash => hash,
@@ -56,47 +45,34 @@ module Calais
56
45
  :locations => locations
57
46
  )
58
47
  end
59
- name_elements.remove
60
-
61
- doc
62
48
  end
63
49
 
64
- def parse_relationships(doc)
65
- relationship_elements = doc.root.search("rdf:Description")
66
- @relationships = relationship_elements.map do |ele|
67
- next if ele.at("c:docId")
68
-
69
- hash = ele.attributes["rdf:about"].split("/").last
70
- type = ele.at("rdf:type").attributes["rdf:resource"].split("/").last
50
+ def parse_relationships
51
+ @libxml.root.find("rdf:Description/rdf:type[contains(@rdf:resource, '/em/r')]/..").each do |n|
52
+ hash = n.properties.to_a.assoc("about").last.split("/").last
53
+ type = n.find_first("rdf:type").properties.to_a.assoc("resource").last.split('/').last
54
+
71
55
  metadata = {}
72
- ele.children.each do |child|
73
- next if child.comment? || child.name == "rdf:type"
74
-
75
- value = if child.attributes["rdf:resource"]
76
- Name.find_in_names(child.attributes["rdf:resource"].split("/").last, @names) rescue nil
77
- else
78
- child.inner_html.strip
79
- end
80
- metadata[child.name.split(":").last] = value
56
+
57
+ n.to_a.each do |n2|
58
+ next if n2.name == "type" or n2.comment?
59
+ resource = n2.properties.to_a.assoc("resource")
60
+ metadata[n2.name] = resource ? Name.find_in_names(resource.last.split("/").last, @names) : n2.content.strip
81
61
  end
82
62
 
83
- locations = doc.root.search("//rdf:Description//c:docId//..").collect! do |ele|
84
- ele unless ele.at("c:subject").attributes["rdf:resource"].match(hash).nil?
85
- end.compact.map do |ele|
86
- start = ele.at("c:offset").inner_html.to_i
87
- Range.new(start, start+ele.at("c:length").inner_html.to_i)
63
+ locations = @libxml.root.find("rdf:Description/c:subject[contains(@rdf:resource, '#{hash}')]/..").map do |n2|
64
+ start = n2.find_first("c:offset").content.to_i
65
+ Range.new(start, start+n2.find_first("c:length").content.to_i)
88
66
  end
89
67
 
90
- Relationship.new(
68
+
69
+ @relationships << Relationship.new(
91
70
  :type => type,
92
71
  :hash => hash,
93
72
  :metadata => metadata,
94
73
  :locations => locations
95
74
  )
96
- end.compact
97
- relationship_elements.remove
98
-
99
- doc
75
+ end
100
76
  end
101
77
  end
102
78
  end
@@ -52,7 +52,7 @@ describe Calais, ".process_document" do
52
52
  it "returns relationships" do
53
53
  @response.relationships.should_not be_nil
54
54
  @response.relationships.should_not be_empty
55
- @response.relationships.map {|r| r.type }.should == ["PersonProfessional"]
55
+ @response.relationships.map {|r| r.type }.should == ["Quotation", "Quotation", "PersonProfessional", "Quotation", "Quotation", "Quotation", "Quotation"]
56
56
  end
57
57
 
58
58
  end
metadata CHANGED
@@ -1,57 +1,33 @@
1
1
  --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.2
3
+ specification_version: 1
2
4
  name: calais
3
5
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
5
- platform: ruby
6
- authors:
7
- - Abhay Kumar
6
+ version: 0.0.5
7
+ date: 2008-03-12 00:00:00 -07:00
8
+ summary: A Ruby interface to the Calais Web Service
9
+ require_paths:
10
+ - lib
11
+ email: info@opensynapse.net
12
+ homepage: http://calais.rubyforge.org
13
+ rubyforge_project: calais
14
+ description: "== Features * Accepts documents in text/plain, text/xml and text/html format. * Basic access to the Open Calais API's Enlighten action. * Output is RDF representation of input document. * Single function ability to tag a document and receive a response in RDF format, names in the document, and their relationships. == Synopsis This is a very basic wrapper to the Open Calais API. It uses the POST endpoint and currently supports the Enlighten action. Here's a simple call: Calais.enlighten(:content => \"The government of the United Kingdom has given corporations like fast food chain McDonald's the right to award high school qualifications to employees who complete a company training program.\", :content_type => :text, :license_id => LICENSE_ID) This is the easiest way to get the RDF-formatted response from the OpenCalais service. If you want to do something more fun like getting all sorts of fun information about a document, you can try this: Calais.process_document(:content => \"The government of the United Kingdom has given corporations like fast food chain McDonald's the right to award high school qualifications to employees who complete a company training program.\", :content_type => :text, :license_id => LICENSE_ID) This will return an object containing the RDF representation of the text, the names in the text, and any relationships that exist there."
8
15
  autorequire:
16
+ default_executable:
9
17
  bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
10
27
  cert_chain:
11
- - |
12
- -----BEGIN CERTIFICATE-----
13
- MIIDNjCCAh6gAwIBAgIBADANBgkqhkiG9w0BAQUFADBBMQ0wCwYDVQQDDARpbmZv
14
- MRswGQYKCZImiZPyLGQBGRYLb3BlbnN5bmFwc2UxEzARBgoJkiaJk/IsZAEZFgNu
15
- ZXQwHhcNMDgwMjAzMDUwODQzWhcNMDkwMjAyMDUwODQzWjBBMQ0wCwYDVQQDDARp
16
- bmZvMRswGQYKCZImiZPyLGQBGRYLb3BlbnN5bmFwc2UxEzARBgoJkiaJk/IsZAEZ
17
- FgNuZXQwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQCmaA3Od1p42luz
18
- zDJepXD3VBFEmmeCUCOjs8rkGIlhRibBvAU8GB0hhkTUykeF6JvAp68FYtIqyTqM
19
- EY7bnyYTWsvX7HrX/wGRshSKZPnxn2b0AnZ9T3QQZyUut1YQ5G+kBQrI76hz9ynA
20
- l0mPCiGxrh+yUNTKt7KzOAzQbtPlqGiIzj+aYvzmdEsj24Ekm/11A/ntPnz+N/Wj
21
- yS5c2tbfZdU8NfwfHCZQUBE4PROYCCjoly0QChvBQzKSZPrEpJB3EedMUyBc5m5E
22
- TQ0u5aItr3isQchwo410x7ixzVveVzn4mchaGCZ3ZuPwaQkuI/7KSSWWH1LCouct
23
- N7LsWR7jAgMBAAGjOTA3MAkGA1UdEwQCMAAwCwYDVR0PBAQDAgSwMB0GA1UdDgQW
24
- BBRsRhkAWj4iWaut121ZcaOAKXG27DANBgkqhkiG9w0BAQUFAAOCAQEAFuqEVgKC
25
- U6f237SZ/hzevOwRkaErF1EcaCEVzuNj+KNdbQOK9oOo+hHyos3jUo17TiUNDi+3
26
- VJhw3cOkA/PEpa0ou0Vm8VIfXdp6dh62NhTKHBVwQ/qXHnn3aVuV/zIfOmi9WQ+t
27
- mr7ehGTw7URly95GOESW4NKQ95p+iquAh/NGhtHGFt+nxjJGUkkYlnGVaxmmgof3
28
- sP2hOrejIrD9jAoejiRhiA+IyEoaYJvlh+D+3MngvnyDFqHiFZgngM0fvTnMTsgT
29
- avOOKhLsesocjiElkLMv8mwuY+L8P4tSvDTDKXxM9Bx/YagwgzYCqPoGtFdWI/GI
30
- +keKvrmaTOJ7CQ==
31
- -----END CERTIFICATE-----
32
-
33
- date: 2008-02-07 00:00:00 -08:00
34
- default_executable:
35
- dependencies:
36
- - !ruby/object:Gem::Dependency
37
- name: hoe
38
- version_requirement:
39
- version_requirements: !ruby/object:Gem::Requirement
40
- requirements:
41
- - - ">="
42
- - !ruby/object:Gem::Version
43
- version: 1.5.0
44
- version:
45
- description: "== Features * Accepts documents in text/plain, text/xml and text/html format. * Basic access to the Open Calais API's Enlighten action. * Output is RDF representation of input document. * Single function ability to tag a document and receive a response in RDF format, names in the document, and their relationships. == Synopsis This is a very basic wrapper to the Open Calais API. It uses the POST endpoint and currently supports the Enlighten action. Here's a simple call: Calais.enlighten(:content => \"The government of the United Kingdom has given corporations like fast food chain McDonald's the right to award high school qualifications to employees who complete a company training program.\", :content_type => :text, :license_id => LICENSE_ID) This is the easiest way to get the RDF-formatted response from the OpenCalais service. If you want to do something more fun like getting all sorts of fun information about a document, you can try this: Calais.process_document(:content => \"The government of the United Kingdom has given corporations like fast food chain McDonald's the right to award high school qualifications to employees who complete a company training program.\", :content_type => :text, :license_id => LICENSE_ID) This will return an object containing the RDF representation of the text, the names in the text, and any relationships that exist there."
46
- email: info@opensynapse.net
47
- executables: []
48
-
49
- extensions: []
50
-
51
- extra_rdoc_files:
52
- - History.txt
53
- - Manifest.txt
54
- - README.txt
28
+ post_install_message:
29
+ authors:
30
+ - Abhay Kumar
55
31
  files:
56
32
  - History.txt
57
33
  - MIT-LICENSE
@@ -71,32 +47,28 @@ files:
71
47
  - spec/fixtures/slovenia_euro.xml
72
48
  - spec/helper.rb
73
49
  - spec/spec.opts
74
- has_rdoc: true
75
- homepage: http://calais.rubyforge.org
76
- post_install_message:
50
+ test_files: []
51
+
77
52
  rdoc_options:
78
53
  - --main
79
54
  - README.txt
80
- require_paths:
81
- - lib
82
- required_ruby_version: !ruby/object:Gem::Requirement
83
- requirements:
84
- - - ">="
85
- - !ruby/object:Gem::Version
86
- version: "0"
87
- version:
88
- required_rubygems_version: !ruby/object:Gem::Requirement
89
- requirements:
90
- - - ">="
91
- - !ruby/object:Gem::Version
92
- version: "0"
93
- version:
94
- requirements: []
55
+ extra_rdoc_files:
56
+ - History.txt
57
+ - Manifest.txt
58
+ - README.txt
59
+ executables: []
95
60
 
96
- rubyforge_project: calais
97
- rubygems_version: 1.0.1
98
- signing_key:
99
- specification_version: 2
100
- summary: A Ruby interface to the Calais Web Service
101
- test_files: []
61
+ extensions: []
62
+
63
+ requirements: []
102
64
 
65
+ dependencies:
66
+ - !ruby/object:Gem::Dependency
67
+ name: hoe
68
+ version_requirement:
69
+ version_requirements: !ruby/object:Gem::Version::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: 1.4.0
74
+ version:
data.tar.gz.sig DELETED
@@ -1 +0,0 @@
1
- ��i/>�[ɵ:��m��<����
metadata.gz.sig DELETED
Binary file