jakal 0.0.9 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -21,7 +21,7 @@ Feature: Calais-Specific features
21
21
  Scenario: Get nested tags from calais
22
22
  Given I have some simple text
23
23
  When I request the nested entities from calais
24
- Then I should get a decent response
24
+ Then I should receive the entities grouped into categories
25
25
 
26
26
  Scenario: Clean up blank items from a calais response
27
27
  Given I have a mock calais response
@@ -44,9 +44,9 @@ end
44
44
 
45
45
  When /^I request the nested entities from calais$/ do
46
46
  key = YAML::load_file('config/keys.yml')['calais']
47
- @response = Jkl::get_calais_metadata key, @text
47
+ @response = Jkl::tags key, @text
48
48
  end
49
49
 
50
- Then /^I should get a decent response$/ do
50
+ Then /^I should receive the entities grouped into categories$/ do
51
51
  @response.eql?({"Person"=>["Barack Obama", "Hillary Clinton"], "Position"=>["Secretary of State"]}).should == true
52
52
  end
data/lib/jkl.rb CHANGED
@@ -6,43 +6,51 @@ require "jkl/url_doc_handler.rb"
6
6
 
7
7
  module Jkl
8
8
 
9
- def self.headlines(feed, keyphrase)
10
- get_from_as_xml "#{feed}#{keyphrase}"
11
- end
9
+ class << self
12
10
 
13
- def self.pages(headlines)
14
- items = get_items_from headlines
15
- descriptions = ""
16
- items.each do |item|
17
- descriptions << attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
11
+ def headlines(feed, keyphrase)
12
+ get_from_as_xml "#{feed}#{keyphrase}"
18
13
  end
19
- descriptions
20
- end
21
-
22
- def self.descriptions(headlines)
23
- items = get_items_from headlines
24
- descriptions = []
25
- items.each do |item|
26
- descriptions << attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
14
+
15
+ def pages(headlines)
16
+ items = get_items_from headlines
17
+ descriptions = ""
18
+ items.each do |item|
19
+ descriptions << attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
20
+ end
21
+ descriptions
27
22
  end
28
- descriptions
29
- end
30
-
31
- def self.links(headlines)
32
- items = get_items_from headlines
33
- links = []
34
- items.each do |item|
35
- links << attribute_from(item, :link)
23
+
24
+ def descriptions(headlines)
25
+ items = get_items_from headlines
26
+ descriptions = []
27
+ items.each do |item|
28
+ descriptions << attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
29
+ end
30
+ descriptions
31
+ end
32
+
33
+ def links(headlines)
34
+ items = get_items_from headlines
35
+ links = []
36
+ items.each do |item|
37
+ links << attribute_from(item, :link)
38
+ end
39
+ links
36
40
  end
37
- links
38
- end
39
41
 
40
- def self.tags(key, pages)
41
- Calais.process_document(
42
- :content => pages,
43
- :content_type => :text,
44
- :license_id => key
45
- )
42
+ def tags(key, text)
43
+ nested_list = {}
44
+ entities(key,text).each do |a|
45
+ nested_list = nested_list.merge!(a){ |key,v1,v2| v1+v2 }
46
+ end
47
+ nested_list
48
+ end
49
+
50
+ def entities(key,text)
51
+ calais_response(key, text).entities.map{|e| {e.type => [e.attributes["name"]]}}
52
+ end
53
+
46
54
  end
47
55
 
48
56
  end
@@ -4,62 +4,69 @@ require "calais"
4
4
 
5
5
  module Jkl
6
6
 
7
- def self.get_from_calais(content)
8
- begin
9
- license_id = YAML::load_file('config/keys.yml')['calais']
10
- c_uri = URI.parse('http://api.opencalais.com/enlighten/rest/')
11
- post_args = { 'licenseID' => license_id, 'content' => content,
12
- 'paramsXML' => paramsXML('application/json') }
13
- post_to(c_uri, post_args)
14
- rescue Exception => e
15
- puts e
7
+ class << self
8
+
9
+ #using the calais gem
10
+ def calais_response(key, pages)
11
+ Calais.process_document(
12
+ :content => pages,
13
+ :content_type => :text,
14
+ :license_id => key
15
+ )
16
16
  end
17
- end
18
17
 
19
- def self.get_tag_from_json(response)
20
- result = JSON.parse response
21
- result.delete_if {|key, value| key == "doc" } # ditching the doc
22
- cleaned_result = []
23
- result.each do |key,tag|
24
- tag = Jkl::clean_unwanted_items_from_hash tag
25
- cleaned_result << tag
26
- yield tag if block_given?
18
+ def get_from_calais(content)
19
+ begin
20
+ license_id = YAML::load_file('config/keys.yml')['calais']
21
+ c_uri = URI.parse('http://api.opencalais.com/enlighten/rest/')
22
+ post_args = { 'licenseID' => license_id, 'content' => content,
23
+ 'paramsXML' => paramsXML('application/json') }
24
+ post_to(c_uri, post_args)
25
+ rescue Exception => e
26
+ puts e
27
+ end
27
28
  end
29
+
30
+ def get_tag_from_json(response)
31
+ result = JSON.parse response
32
+ result.delete_if {|key, value| key == "doc" } # ditching the doc
33
+ cleaned_result = []
34
+ result.each do |key,tag|
35
+ tag = Jkl::clean_unwanted_items_from_hash tag
36
+ cleaned_result << tag
37
+ yield tag if block_given?
38
+ end
28
39
 
29
- cleaned_result
30
- end
31
-
32
- def self.get_calais_metadata(key, text)
33
- name_value_pairs = Jkl::tags(key, text).entities.map{|e| {e.type => [e.attributes["name"]]}}
34
- nested_list = {}
35
- name_value_pairs.each { |a| nested_list = nested_list.merge!(a){ |key,v1,v2| v1+v2 }}
36
- nested_list
37
- end
40
+ cleaned_result
41
+ end
38
42
 
39
- #jkl doesn't work with these aspects of the calais response, also removing blanks
40
- def self.clean_unwanted_items_from_hash h
41
- h.delete_if {|k, v| k == "relevance" }
42
- h.delete_if {|k, v| k == "instances" }
43
- h.delete_if {|k, v| v == "N/A"}
44
- h.delete_if {|k, v| v == []}
45
- h.delete_if {|k, v| v == ""}
46
- h.delete_if {|k, v| k == "_typeGroup"}
47
- h
48
- end
43
+ #jkl doesn't work with these aspects of the calais response, also removing blanks
44
+ def clean_unwanted_items_from_hash h
45
+ h.delete_if {|k, v| k == "relevance" }
46
+ h.delete_if {|k, v| k == "instances" }
47
+ h.delete_if {|k, v| v == "N/A"}
48
+ h.delete_if {|k, v| v == []}
49
+ h.delete_if {|k, v| v == ""}
50
+ h.delete_if {|k, v| k == "_typeGroup"}
51
+ h
52
+ end
49
53
 
50
- private
54
+ private
55
+
56
+ def paramsXML(format)
57
+ <<-paramsXML;
58
+ <c:params xmlns:c="http://s.opencalais.com/1/pred/"
59
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
60
+ <c:processingDirectives
61
+ c:contentType="text/txt"
62
+ c:outputFormat="#{format}">
63
+ </c:processingDirectives>
64
+ <c:userDirectives />
65
+ <c:externalMetadata />
66
+ </c:params>
67
+ paramsXML
68
+ end
51
69
 
52
- def self.paramsXML(format)
53
- <<-paramsXML;
54
- <c:params xmlns:c="http://s.opencalais.com/1/pred/"
55
- xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
56
- <c:processingDirectives
57
- c:contentType="text/txt"
58
- c:outputFormat="#{format}">
59
- </c:processingDirectives>
60
- <c:userDirectives />
61
- <c:externalMetadata />
62
- </c:params>
63
- paramsXML
64
70
  end
71
+
65
72
  end
@@ -3,34 +3,38 @@ require 'hpricot'
3
3
 
4
4
  module Jkl
5
5
 
6
- def self.post_to(uri, post_args)
7
- begin
8
- resp, data = Net::HTTP.post_form(uri, post_args)
9
- data
10
- rescue URI::InvalidURIError => e
11
- puts("WARN: Invalid URI: #{e}")
12
- rescue SocketError => e
13
- puts("WARN: Could not connect: #{e}")
14
- rescue Errno::ECONNREFUSED => e
15
- puts("WARN: Connection refused: #{e}")
6
+ class << self
7
+
8
+ def post_to(uri, post_args)
9
+ begin
10
+ resp, data = Net::HTTP.post_form(uri, post_args)
11
+ data
12
+ rescue URI::InvalidURIError => e
13
+ puts("WARN: Invalid URI: #{e}")
14
+ rescue SocketError => e
15
+ puts("WARN: Could not connect: #{e}")
16
+ rescue Errno::ECONNREFUSED => e
17
+ puts("WARN: Connection refused: #{e}")
18
+ end
16
19
  end
17
- end
18
20
 
19
- def self.get_from(uri)
20
- begin
21
- res = Net::HTTP.get_response(URI.parse(uri))
22
- res.body
23
- rescue URI::InvalidURIError => e
24
- puts("WARN: Invalid URI: #{e}")
25
- rescue SocketError => e
26
- puts("WARN: Could not connect: #{e}")
27
- rescue Errno::ECONNREFUSED => e
28
- puts("WARN: Connection refused: #{e}")
21
+ def get_from(uri)
22
+ begin
23
+ res = Net::HTTP.get_response(URI.parse(uri))
24
+ res.body
25
+ rescue URI::InvalidURIError => e
26
+ puts("WARN: Invalid URI: #{e}")
27
+ rescue SocketError => e
28
+ puts("WARN: Could not connect: #{e}")
29
+ rescue Errno::ECONNREFUSED => e
30
+ puts("WARN: Connection refused: #{e}")
31
+ end
29
32
  end
30
- end
31
33
 
32
- def self.get_from_as_xml(uri)
33
- Hpricot.XML get_from uri
34
- end
34
+ def get_from_as_xml(uri)
35
+ Hpricot.XML get_from uri
36
+ end
35
37
 
38
+ end
39
+
36
40
  end
@@ -2,14 +2,18 @@ require 'hpricot'
2
2
 
3
3
  module Jkl
4
4
 
5
- def self.get_items_from(rssdoc)
6
- items = []
7
- (rssdoc/:item).each { |rssitem| items.push rssitem } unless rssdoc==nil
8
- items
9
- end
5
+ class << self
6
+
7
+ def get_items_from(rssdoc)
8
+ items = []
9
+ (rssdoc/:item).each { |rssitem| items.push rssitem } unless rssdoc==nil
10
+ items
11
+ end
12
+
13
+ def attribute_from(item, name)
14
+ (item/name).inner_html
15
+ end
10
16
 
11
- def self.attribute_from(item, name)
12
- (item/name).inner_html
13
17
  end
14
18
 
15
19
  end
@@ -3,29 +3,33 @@ require 'rest_client'
3
3
 
4
4
  module Jkl
5
5
 
6
- def self.sanitize(text)
7
- str = ""
8
- text = text.to_s.gsub(/((<[\s\/]*script\b[^>]*>)([^>]*)(<\/script>))/i,"") #remove script tags - with contents
9
- text.to_s.gsub(/<\/?[^>]*>/, "").split("\r").each do |l| # remove all tags
10
- l = l.gsub(/^[ \t]/,"") #remove tabs
11
- l = l.gsub(/^[ \s]/,"")
12
- l.split("\n").each do |l|
13
- str << l unless l.count(" ") < 5 # remove short lines - usually just navigation
6
+ class << self
7
+
8
+ def sanitize(text)
9
+ str = ""
10
+ text = text.to_s.gsub(/((<[\s\/]*script\b[^>]*>)([^>]*)(<\/script>))/i,"") #remove script tags - with contents
11
+ text.to_s.gsub(/<\/?[^>]*>/, "").split("\r").each do |l| # remove all tags
12
+ l = l.gsub(/^[ \t]/,"") #remove tabs
13
+ l = l.gsub(/^[ \s]/,"")
14
+ l.split("\n").each do |l|
15
+ str << l unless l.count(" ") < 5 # remove short lines - usually just navigation
16
+ end
14
17
  end
18
+ str
15
19
  end
16
- str
17
- end
18
-
19
- def self.from_doc(response)
20
- begin
21
- Hpricot(response)
22
- rescue URI::InvalidURIError => e
23
- puts("WARN: Problem with getting a connection: #{e}")
24
- rescue SocketError => e
25
- puts("WARN: Could not connect to feed: #{e}")
26
- rescue Errno::ECONNREFUSED => e
27
- puts("WARN: Connection refused: #{e}")
20
+
21
+ def from_doc(response)
22
+ begin
23
+ Hpricot(response)
24
+ rescue URI::InvalidURIError => e
25
+ puts("WARN: Problem with getting a connection: #{e}")
26
+ rescue SocketError => e
27
+ puts("WARN: Could not connect to feed: #{e}")
28
+ rescue Errno::ECONNREFUSED => e
29
+ puts("WARN: Connection refused: #{e}")
30
+ end
28
31
  end
32
+
29
33
  end
30
34
 
31
35
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jakal
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - sshingler
@@ -13,7 +13,7 @@ date: 2009-08-27 00:00:00 +01:00
13
13
  default_executable:
14
14
  dependencies: []
15
15
 
16
- description: Jakal is a Ruby library for dealing with information overload.
16
+ description: Jakal is a Ruby library which contains some utilities for connecting to internet based APIs.
17
17
  email: "'shingler@gmail.com'"
18
18
  executables: []
19
19
 
@@ -30,7 +30,6 @@ files:
30
30
  - lib/jkl/url_doc_handler.rb
31
31
  - features/calais.feature
32
32
  - features/http.feature
33
- - features/processing.feature
34
33
  - features/sanitize-text.feature
35
34
  - features/mocks/bbc_story.html
36
35
  - features/mocks/calais.json
@@ -38,7 +37,6 @@ files:
38
37
  - features/mocks/twitter.json
39
38
  - features/step_definitions/calais_steps.rb
40
39
  - features/step_definitions/http_steps.rb
41
- - features/step_definitions/processing_steps.rb
42
40
  - features/step_definitions/require_steps.rb
43
41
  - features/step_definitions/sanitize-text_steps.rb
44
42
  - features/step_definitions/twitter_steps.rb
@@ -73,6 +71,6 @@ rubyforge_project:
73
71
  rubygems_version: 1.3.5
74
72
  signing_key:
75
73
  specification_version: 3
76
- summary: Jakal is a Ruby library for dealing with information overload.
74
+ summary: Jakal is a Ruby library which contains some utilities for connecting to internet based APIs.
77
75
  test_files: []
78
76
 
@@ -1,16 +0,0 @@
1
- Feature: Processing features
2
- In order to integrate our apps
3
- As a developer
4
- I want to make some requests and inspect some responses
5
-
6
- @connection_needed
7
- Scenario: end to end flow, keyphrase to tags
8
- Given I have a keyphrase 'london restaurants'
9
- When I request some RSS
10
- Then I should receive some headlines
11
- And I should be able to get the copy from the first headline
12
- When I post to calais
13
- Then I should receive some tags
14
- And I should be able to persist these tags
15
- When I generate a view of the recent keyword results
16
- Then I should see a network graph
@@ -1,30 +0,0 @@
1
-
2
- ############### pending steps below ################
3
-
4
-
5
- When /^I request tags for the first story$/ do
6
- r = Jkl::get_from_calais @story
7
- Jkl::get_tag_from_json(get_from_calais(@story)) do |tag|
8
- tag.each{|k,v| puts "#{k} : #{v}"}
9
- end
10
- end
11
-
12
- When /^I request stories from Topix$/ do
13
- search_term = 'london'
14
- url = "#{YAML::load_file('config/config.yml')['topix']}#{search_term}"
15
- @response = Jkl::get_from_as_xml url
16
- end
17
-
18
- When /^I get some news stories from the first keyword$/ do
19
- search_term = @trend['name'].gsub('#','') #removing hash from start of trend name
20
- search_term = 'london'
21
- url = "#{YAML::load_file('config/config.yml')['topix']}#{search_term}"
22
- rss_response = Jkl::get_from_as_xml url
23
- items = Jkl::get_items_from rss_response
24
- links = []
25
- items.each do |item|
26
- links << attribute_from(item, :link)
27
- end
28
- @story = Jkl::sanitize Jkl::from_doc Jkl::get_from links[0]
29
- end
30
-