jakal 0.0.9 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,7 +21,7 @@ Feature: Calais-Specific features
21
21
  Scenario: Get nested tags from calais
22
22
  Given I have some simple text
23
23
  When I request the nested entities from calais
24
- Then I should get a decent response
24
+ Then I should receive the entities grouped into categories
25
25
 
26
26
  Scenario: Clean up blank items from a calais response
27
27
  Given I have a mock calais response
@@ -44,9 +44,9 @@ end
44
44
 
45
45
  When /^I request the nested entities from calais$/ do
46
46
  key = YAML::load_file('config/keys.yml')['calais']
47
- @response = Jkl::get_calais_metadata key, @text
47
+ @response = Jkl::tags key, @text
48
48
  end
49
49
 
50
- Then /^I should get a decent response$/ do
50
+ Then /^I should receive the entities grouped into categories$/ do
51
51
  @response.eql?({"Person"=>["Barack Obama", "Hillary Clinton"], "Position"=>["Secretary of State"]}).should == true
52
52
  end
data/lib/jkl.rb CHANGED
@@ -6,43 +6,51 @@ require "jkl/url_doc_handler.rb"
6
6
 
7
7
  module Jkl
8
8
 
9
- def self.headlines(feed, keyphrase)
10
- get_from_as_xml "#{feed}#{keyphrase}"
11
- end
9
+ class << self
12
10
 
13
- def self.pages(headlines)
14
- items = get_items_from headlines
15
- descriptions = ""
16
- items.each do |item|
17
- descriptions << attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
11
+ def headlines(feed, keyphrase)
12
+ get_from_as_xml "#{feed}#{keyphrase}"
18
13
  end
19
- descriptions
20
- end
21
-
22
- def self.descriptions(headlines)
23
- items = get_items_from headlines
24
- descriptions = []
25
- items.each do |item|
26
- descriptions << attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
14
+
15
+ def pages(headlines)
16
+ items = get_items_from headlines
17
+ descriptions = ""
18
+ items.each do |item|
19
+ descriptions << attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
20
+ end
21
+ descriptions
27
22
  end
28
- descriptions
29
- end
30
-
31
- def self.links(headlines)
32
- items = get_items_from headlines
33
- links = []
34
- items.each do |item|
35
- links << attribute_from(item, :link)
23
+
24
+ def descriptions(headlines)
25
+ items = get_items_from headlines
26
+ descriptions = []
27
+ items.each do |item|
28
+ descriptions << attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
29
+ end
30
+ descriptions
31
+ end
32
+
33
+ def links(headlines)
34
+ items = get_items_from headlines
35
+ links = []
36
+ items.each do |item|
37
+ links << attribute_from(item, :link)
38
+ end
39
+ links
36
40
  end
37
- links
38
- end
39
41
 
40
- def self.tags(key, pages)
41
- Calais.process_document(
42
- :content => pages,
43
- :content_type => :text,
44
- :license_id => key
45
- )
42
+ def tags(key, text)
43
+ nested_list = {}
44
+ entities(key,text).each do |a|
45
+ nested_list = nested_list.merge!(a){ |key,v1,v2| v1+v2 }
46
+ end
47
+ nested_list
48
+ end
49
+
50
+ def entities(key,text)
51
+ calais_response(key, text).entities.map{|e| {e.type => [e.attributes["name"]]}}
52
+ end
53
+
46
54
  end
47
55
 
48
56
  end
@@ -4,62 +4,69 @@ require "calais"
4
4
 
5
5
  module Jkl
6
6
 
7
- def self.get_from_calais(content)
8
- begin
9
- license_id = YAML::load_file('config/keys.yml')['calais']
10
- c_uri = URI.parse('http://api.opencalais.com/enlighten/rest/')
11
- post_args = { 'licenseID' => license_id, 'content' => content,
12
- 'paramsXML' => paramsXML('application/json') }
13
- post_to(c_uri, post_args)
14
- rescue Exception => e
15
- puts e
7
+ class << self
8
+
9
+ #using the calais gem
10
+ def calais_response(key, pages)
11
+ Calais.process_document(
12
+ :content => pages,
13
+ :content_type => :text,
14
+ :license_id => key
15
+ )
16
16
  end
17
- end
18
17
 
19
- def self.get_tag_from_json(response)
20
- result = JSON.parse response
21
- result.delete_if {|key, value| key == "doc" } # ditching the doc
22
- cleaned_result = []
23
- result.each do |key,tag|
24
- tag = Jkl::clean_unwanted_items_from_hash tag
25
- cleaned_result << tag
26
- yield tag if block_given?
18
+ def get_from_calais(content)
19
+ begin
20
+ license_id = YAML::load_file('config/keys.yml')['calais']
21
+ c_uri = URI.parse('http://api.opencalais.com/enlighten/rest/')
22
+ post_args = { 'licenseID' => license_id, 'content' => content,
23
+ 'paramsXML' => paramsXML('application/json') }
24
+ post_to(c_uri, post_args)
25
+ rescue Exception => e
26
+ puts e
27
+ end
27
28
  end
29
+
30
+ def get_tag_from_json(response)
31
+ result = JSON.parse response
32
+ result.delete_if {|key, value| key == "doc" } # ditching the doc
33
+ cleaned_result = []
34
+ result.each do |key,tag|
35
+ tag = Jkl::clean_unwanted_items_from_hash tag
36
+ cleaned_result << tag
37
+ yield tag if block_given?
38
+ end
28
39
 
29
- cleaned_result
30
- end
31
-
32
- def self.get_calais_metadata(key, text)
33
- name_value_pairs = Jkl::tags(key, text).entities.map{|e| {e.type => [e.attributes["name"]]}}
34
- nested_list = {}
35
- name_value_pairs.each { |a| nested_list = nested_list.merge!(a){ |key,v1,v2| v1+v2 }}
36
- nested_list
37
- end
40
+ cleaned_result
41
+ end
38
42
 
39
- #jkl doesn't work with these aspects of the calais response, also removing blanks
40
- def self.clean_unwanted_items_from_hash h
41
- h.delete_if {|k, v| k == "relevance" }
42
- h.delete_if {|k, v| k == "instances" }
43
- h.delete_if {|k, v| v == "N/A"}
44
- h.delete_if {|k, v| v == []}
45
- h.delete_if {|k, v| v == ""}
46
- h.delete_if {|k, v| k == "_typeGroup"}
47
- h
48
- end
43
+ #jkl doesn't work with these aspects of the calais response, also removing blanks
44
+ def clean_unwanted_items_from_hash h
45
+ h.delete_if {|k, v| k == "relevance" }
46
+ h.delete_if {|k, v| k == "instances" }
47
+ h.delete_if {|k, v| v == "N/A"}
48
+ h.delete_if {|k, v| v == []}
49
+ h.delete_if {|k, v| v == ""}
50
+ h.delete_if {|k, v| k == "_typeGroup"}
51
+ h
52
+ end
49
53
 
50
- private
54
+ private
55
+
56
+ def paramsXML(format)
57
+ <<-paramsXML;
58
+ <c:params xmlns:c="http://s.opencalais.com/1/pred/"
59
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
60
+ <c:processingDirectives
61
+ c:contentType="text/txt"
62
+ c:outputFormat="#{format}">
63
+ </c:processingDirectives>
64
+ <c:userDirectives />
65
+ <c:externalMetadata />
66
+ </c:params>
67
+ paramsXML
68
+ end
51
69
 
52
- def self.paramsXML(format)
53
- <<-paramsXML;
54
- <c:params xmlns:c="http://s.opencalais.com/1/pred/"
55
- xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
56
- <c:processingDirectives
57
- c:contentType="text/txt"
58
- c:outputFormat="#{format}">
59
- </c:processingDirectives>
60
- <c:userDirectives />
61
- <c:externalMetadata />
62
- </c:params>
63
- paramsXML
64
70
  end
71
+
65
72
  end
@@ -3,34 +3,38 @@ require 'hpricot'
3
3
 
4
4
  module Jkl
5
5
 
6
- def self.post_to(uri, post_args)
7
- begin
8
- resp, data = Net::HTTP.post_form(uri, post_args)
9
- data
10
- rescue URI::InvalidURIError => e
11
- puts("WARN: Invalid URI: #{e}")
12
- rescue SocketError => e
13
- puts("WARN: Could not connect: #{e}")
14
- rescue Errno::ECONNREFUSED => e
15
- puts("WARN: Connection refused: #{e}")
6
+ class << self
7
+
8
+ def post_to(uri, post_args)
9
+ begin
10
+ resp, data = Net::HTTP.post_form(uri, post_args)
11
+ data
12
+ rescue URI::InvalidURIError => e
13
+ puts("WARN: Invalid URI: #{e}")
14
+ rescue SocketError => e
15
+ puts("WARN: Could not connect: #{e}")
16
+ rescue Errno::ECONNREFUSED => e
17
+ puts("WARN: Connection refused: #{e}")
18
+ end
16
19
  end
17
- end
18
20
 
19
- def self.get_from(uri)
20
- begin
21
- res = Net::HTTP.get_response(URI.parse(uri))
22
- res.body
23
- rescue URI::InvalidURIError => e
24
- puts("WARN: Invalid URI: #{e}")
25
- rescue SocketError => e
26
- puts("WARN: Could not connect: #{e}")
27
- rescue Errno::ECONNREFUSED => e
28
- puts("WARN: Connection refused: #{e}")
21
+ def get_from(uri)
22
+ begin
23
+ res = Net::HTTP.get_response(URI.parse(uri))
24
+ res.body
25
+ rescue URI::InvalidURIError => e
26
+ puts("WARN: Invalid URI: #{e}")
27
+ rescue SocketError => e
28
+ puts("WARN: Could not connect: #{e}")
29
+ rescue Errno::ECONNREFUSED => e
30
+ puts("WARN: Connection refused: #{e}")
31
+ end
29
32
  end
30
- end
31
33
 
32
- def self.get_from_as_xml(uri)
33
- Hpricot.XML get_from uri
34
- end
34
+ def get_from_as_xml(uri)
35
+ Hpricot.XML get_from uri
36
+ end
35
37
 
38
+ end
39
+
36
40
  end
@@ -2,14 +2,18 @@ require 'hpricot'
2
2
 
3
3
  module Jkl
4
4
 
5
- def self.get_items_from(rssdoc)
6
- items = []
7
- (rssdoc/:item).each { |rssitem| items.push rssitem } unless rssdoc==nil
8
- items
9
- end
5
+ class << self
6
+
7
+ def get_items_from(rssdoc)
8
+ items = []
9
+ (rssdoc/:item).each { |rssitem| items.push rssitem } unless rssdoc==nil
10
+ items
11
+ end
12
+
13
+ def attribute_from(item, name)
14
+ (item/name).inner_html
15
+ end
10
16
 
11
- def self.attribute_from(item, name)
12
- (item/name).inner_html
13
17
  end
14
18
 
15
19
  end
@@ -3,29 +3,33 @@ require 'rest_client'
3
3
 
4
4
  module Jkl
5
5
 
6
- def self.sanitize(text)
7
- str = ""
8
- text = text.to_s.gsub(/((<[\s\/]*script\b[^>]*>)([^>]*)(<\/script>))/i,"") #remove script tags - with contents
9
- text.to_s.gsub(/<\/?[^>]*>/, "").split("\r").each do |l| # remove all tags
10
- l = l.gsub(/^[ \t]/,"") #remove tabs
11
- l = l.gsub(/^[ \s]/,"")
12
- l.split("\n").each do |l|
13
- str << l unless l.count(" ") < 5 # remove short lines - ususally just navigation
6
+ class << self
7
+
8
+ def sanitize(text)
9
+ str = ""
10
+ text = text.to_s.gsub(/((<[\s\/]*script\b[^>]*>)([^>]*)(<\/script>))/i,"") #remove script tags - with contents
11
+ text.to_s.gsub(/<\/?[^>]*>/, "").split("\r").each do |l| # remove all tags
12
+ l = l.gsub(/^[ \t]/,"") #remove tabs
13
+ l = l.gsub(/^[ \s]/,"")
14
+ l.split("\n").each do |l|
15
+ str << l unless l.count(" ") < 5 # remove short lines - ususally just navigation
16
+ end
14
17
  end
18
+ str
15
19
  end
16
- str
17
- end
18
-
19
- def self.from_doc(response)
20
- begin
21
- Hpricot(response)
22
- rescue URI::InvalidURIError => e
23
- puts("WARN: Problem with getting a connection: #{e}")
24
- rescue SocketError => e
25
- puts("WARN: Could not connect to feed: #{e}")
26
- rescue Errno::ECONNREFUSED => e
27
- puts("WARN: Connection refused: #{e}")
20
+
21
+ def from_doc(response)
22
+ begin
23
+ Hpricot(response)
24
+ rescue URI::InvalidURIError => e
25
+ puts("WARN: Problem with getting a connection: #{e}")
26
+ rescue SocketError => e
27
+ puts("WARN: Could not connect to feed: #{e}")
28
+ rescue Errno::ECONNREFUSED => e
29
+ puts("WARN: Connection refused: #{e}")
30
+ end
28
31
  end
32
+
29
33
  end
30
34
 
31
35
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jakal
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.9
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - sshingler
@@ -13,7 +13,7 @@ date: 2009-08-27 00:00:00 +01:00
13
13
  default_executable:
14
14
  dependencies: []
15
15
 
16
- description: Jakal is a Ruby library for dealing with information overload.
16
+ description: Jakal is a Ruby library which contains some utilies for connecting to internet based APIs.
17
17
  email: "'shingler@gmail.com'"
18
18
  executables: []
19
19
 
@@ -30,7 +30,6 @@ files:
30
30
  - lib/jkl/url_doc_handler.rb
31
31
  - features/calais.feature
32
32
  - features/http.feature
33
- - features/processing.feature
34
33
  - features/sanitize-text.feature
35
34
  - features/mocks/bbc_story.html
36
35
  - features/mocks/calais.json
@@ -38,7 +37,6 @@ files:
38
37
  - features/mocks/twitter.json
39
38
  - features/step_definitions/calais_steps.rb
40
39
  - features/step_definitions/http_steps.rb
41
- - features/step_definitions/processing_steps.rb
42
40
  - features/step_definitions/require_steps.rb
43
41
  - features/step_definitions/sanitize-text_steps.rb
44
42
  - features/step_definitions/twitter_steps.rb
@@ -73,6 +71,6 @@ rubyforge_project:
73
71
  rubygems_version: 1.3.5
74
72
  signing_key:
75
73
  specification_version: 3
76
- summary: Jakal is a Ruby library for dealing with information overload.
74
+ summary: Jakal is a Ruby library which contains some utilies for connecting to internet based APIs.
77
75
  test_files: []
78
76
 
@@ -1,16 +0,0 @@
1
- Feature: Processing features
2
- In order to integrate our apps
3
- As a developer
4
- I want to make some requests and inspect some responses
5
-
6
- @connection_needed
7
- Scenario: end to end flow, keyphrase to tags
8
- Given I have a keyphrase 'london restaurants'
9
- When I request some RSS
10
- Then I should receive some headlines
11
- And I should be able to get the copy from the first headline
12
- When I post to calais
13
- Then I should receive some tags
14
- And I should be able to persist these tags
15
- When I generate a view of the recent keyword results
16
- Then I should see a network graph
@@ -1,30 +0,0 @@
1
-
2
- ############### pending steps below ################
3
-
4
-
5
- When /^I request tags for the first story$/ do
6
- r = Jkl::get_from_calais @story
7
- Jkl::get_tag_from_json(get_from_calais(@story)) do |tag|
8
- tag.each{|k,v| puts "#{k} : #{v}"}
9
- end
10
- end
11
-
12
- When /^I request stories from Topix$/ do
13
- search_term = 'london'
14
- url = "#{YAML::load_file('config/config.yml')['topix']}#{search_term}"
15
- @response = Jkl::get_from_as_xml url
16
- end
17
-
18
- When /^I get some news stories from the first keyword$/ do
19
- search_term = @trend['name'].gsub('#','') #removing hash from start of trend name
20
- search_term = 'london'
21
- url = "#{YAML::load_file('config/config.yml')['topix']}#{search_term}"
22
- rss_response = Jkl::get_from_as_xml url
23
- items = Jkl::get_items_from rss_response
24
- links = []
25
- items.each do |item|
26
- links << attribute_from(item, :link)
27
- end
28
- @story = Jkl::sanitize Jkl::from_doc Jkl::get_from links[0]
29
- end
30
-