jakal 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/features/http.feature +0 -6
- data/features/step_definitions/calais_steps.rb +3 -2
- data/features/step_definitions/http_steps.rb +3 -4
- data/lib/jkl.rb +0 -48
- data/lib/jkl/calais_client.rb +66 -57
- data/lib/jkl/rest_client.rb +2 -4
- data/lib/jkl/rss_client.rb +20 -12
- metadata +1 -1
data/features/http.feature
CHANGED
@@ -13,7 +13,8 @@ Given /^I have a mock calais response$/ do
|
|
13
13
|
end
|
14
14
|
|
15
15
|
When /^I post to calais$/ do
|
16
|
-
|
16
|
+
key = YAML::load_file('config/keys.yml')['calais']
|
17
|
+
@response = Jkl::Extraction::get_from_calais(key, @text)
|
17
18
|
end
|
18
19
|
|
19
20
|
When /^I remove the unwanted items$/ do
|
@@ -44,7 +45,7 @@ end
|
|
44
45
|
|
45
46
|
When /^I request the nested entities from calais$/ do
|
46
47
|
key = YAML::load_file('config/keys.yml')['calais']
|
47
|
-
@response = Jkl::tags key, @text
|
48
|
+
@response = Jkl::Extraction::tags key, @text
|
48
49
|
end
|
49
50
|
|
50
51
|
Then /^I should receive the entities grouped into categories$/ do
|
@@ -9,7 +9,7 @@ end
|
|
9
9
|
When /^I request some RSS$/ do
|
10
10
|
keyphrase = @keyphrase || "iraq"
|
11
11
|
url = "#{YAML::load_file('config/config.yml')['topix']}#{CGI::escape(keyphrase)}"
|
12
|
-
@response = Jkl::
|
12
|
+
@response = Jkl::get_xml_from url
|
13
13
|
end
|
14
14
|
|
15
15
|
Given /^I have some RSS$/ do
|
@@ -28,17 +28,16 @@ When /^I request some trends$/ do
|
|
28
28
|
@response = output['trends']
|
29
29
|
end
|
30
30
|
|
31
|
-
|
32
31
|
Then /^I should get a response$/ do
|
33
32
|
@response.should_not == nil
|
34
33
|
#puts @response.inspect
|
35
34
|
end
|
36
35
|
|
37
36
|
Then /^I should receive some headlines$/ do
|
38
|
-
@items = Jkl::
|
37
|
+
@items = Jkl::Rss::items @response
|
39
38
|
@links = []
|
40
39
|
@items.each do |item|
|
41
|
-
@links << Jkl::attribute_from(item, :link)
|
40
|
+
@links << Jkl::Rss::attribute_from(item, :link)
|
42
41
|
end
|
43
42
|
@links.should_not == nil
|
44
43
|
@links.length.should > 0
|
data/lib/jkl.rb
CHANGED
@@ -1,56 +1,8 @@
|
|
1
|
-
require "cgi"
|
2
1
|
require "jkl/rest_client.rb"
|
3
2
|
require "jkl/rss_client.rb"
|
4
3
|
require "jkl/calais_client.rb"
|
5
4
|
require "jkl/url_doc_handler.rb"
|
6
5
|
|
7
6
|
module Jkl
|
8
|
-
|
9
|
-
class << self
|
10
7
|
|
11
|
-
def headlines(feed, keyphrase)
|
12
|
-
get_from_as_xml "#{feed}#{keyphrase}"
|
13
|
-
end
|
14
|
-
|
15
|
-
def pages(headlines)
|
16
|
-
items = get_items_from headlines
|
17
|
-
descriptions = ""
|
18
|
-
items.each do |item|
|
19
|
-
descriptions << attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
|
20
|
-
end
|
21
|
-
descriptions
|
22
|
-
end
|
23
|
-
|
24
|
-
def descriptions(headlines)
|
25
|
-
items = get_items_from headlines
|
26
|
-
descriptions = []
|
27
|
-
items.each do |item|
|
28
|
-
descriptions << attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
|
29
|
-
end
|
30
|
-
descriptions
|
31
|
-
end
|
32
|
-
|
33
|
-
def links(headlines)
|
34
|
-
items = get_items_from headlines
|
35
|
-
links = []
|
36
|
-
items.each do |item|
|
37
|
-
links << attribute_from(item, :link)
|
38
|
-
end
|
39
|
-
links
|
40
|
-
end
|
41
|
-
|
42
|
-
def tags(key, text)
|
43
|
-
nested_list = {}
|
44
|
-
entities(key,text).each do |a|
|
45
|
-
nested_list = nested_list.merge!(a){ |key,v1,v2| v1+v2 }
|
46
|
-
end
|
47
|
-
nested_list
|
48
|
-
end
|
49
|
-
|
50
|
-
def entities(key,text)
|
51
|
-
calais_response(key, text).entities.map{|e| {e.type => [e.attributes["name"]]}}
|
52
|
-
end
|
53
|
-
|
54
|
-
end
|
55
|
-
|
56
8
|
end
|
data/lib/jkl/calais_client.rb
CHANGED
@@ -1,72 +1,81 @@
|
|
1
1
|
require "json"
|
2
|
-
require "rest_client"
|
3
2
|
require "calais"
|
4
3
|
|
5
|
-
|
4
|
+
require "rest_client"
|
6
5
|
|
7
|
-
|
6
|
+
module Jkl
|
7
|
+
module Extraction
|
8
|
+
class << self
|
8
9
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
10
|
+
#using the calais gem
|
11
|
+
def calais_response(key, pages)
|
12
|
+
Calais.process_document(
|
13
|
+
:content => pages,
|
14
|
+
:content_type => :text,
|
15
|
+
:license_id => key
|
16
|
+
)
|
17
|
+
end
|
17
18
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
19
|
+
def tags(key, text)
|
20
|
+
nested_list = {}
|
21
|
+
entities(key,text).each do |a|
|
22
|
+
nested_list = nested_list.merge!(a){ |key,v1,v2| v1+v2 }
|
23
|
+
end
|
24
|
+
nested_list
|
25
|
+
end
|
26
|
+
|
27
|
+
def entities(key,text)
|
28
|
+
calais_response(key, text).entities.map{|e| {e.type => [e.attributes["name"]]}}
|
29
|
+
end
|
30
|
+
|
31
|
+
#not using calais gem, experimenting with json response
|
32
|
+
def get_from_calais(key, content)
|
33
|
+
post_args = {
|
34
|
+
"licenseID" => key,
|
35
|
+
"content" => content,
|
36
|
+
"paramsXML" => paramsXML("application/json")
|
37
|
+
}
|
38
|
+
Jkl::post_to(URI.parse("http://api.opencalais.com/enlighten/rest/"), post_args)
|
27
39
|
end
|
28
|
-
end
|
29
40
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
41
|
+
def get_tag_from_json(response)
|
42
|
+
result = JSON.parse response
|
43
|
+
result.delete_if {|key, value| key == "doc" } # ditching the doc
|
44
|
+
cleaned_result = []
|
45
|
+
result.each do |key,tag|
|
46
|
+
tag = Jkl::clean_unwanted_items_from_hash tag
|
47
|
+
cleaned_result << tag
|
48
|
+
yield tag if block_given?
|
49
|
+
end
|
50
|
+
cleaned_result
|
38
51
|
end
|
39
52
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
h.delete_if {|k, v| v == ""}
|
50
|
-
h.delete_if {|k, v| k == "_typeGroup"}
|
51
|
-
h
|
52
|
-
end
|
53
|
+
def clean_unwanted_items_from_hash h
|
54
|
+
h.delete_if {|k, v| k == "relevance" }
|
55
|
+
h.delete_if {|k, v| k == "instances" }
|
56
|
+
h.delete_if {|k, v| v == "N/A"}
|
57
|
+
h.delete_if {|k, v| v == []}
|
58
|
+
h.delete_if {|k, v| v == ""}
|
59
|
+
h.delete_if {|k, v| k == "_typeGroup"}
|
60
|
+
h
|
61
|
+
end
|
53
62
|
|
54
|
-
|
63
|
+
private
|
55
64
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
65
|
+
def paramsXML(format)
|
66
|
+
<<-paramsXML;
|
67
|
+
<c:params xmlns:c="http://s.opencalais.com/1/pred/"
|
68
|
+
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
69
|
+
<c:processingDirectives
|
70
|
+
c:contentType="text/txt"
|
71
|
+
c:outputFormat="#{format}">
|
72
|
+
</c:processingDirectives>
|
73
|
+
<c:userDirectives />
|
74
|
+
<c:externalMetadata />
|
75
|
+
</c:params>
|
76
|
+
paramsXML
|
77
|
+
end
|
69
78
|
|
79
|
+
end
|
70
80
|
end
|
71
|
-
|
72
81
|
end
|
data/lib/jkl/rest_client.rb
CHANGED
@@ -2,7 +2,6 @@ require 'net/http'
|
|
2
2
|
require 'hpricot'
|
3
3
|
|
4
4
|
module Jkl
|
5
|
-
|
6
5
|
class << self
|
7
6
|
|
8
7
|
def post_to(uri, post_args)
|
@@ -31,10 +30,9 @@ module Jkl
|
|
31
30
|
end
|
32
31
|
end
|
33
32
|
|
34
|
-
def
|
33
|
+
def get_xml_from(uri)
|
35
34
|
Hpricot.XML get_from uri
|
36
35
|
end
|
37
36
|
|
38
37
|
end
|
39
|
-
|
40
|
-
end
|
38
|
+
end
|
data/lib/jkl/rss_client.rb
CHANGED
@@ -1,19 +1,27 @@
|
|
1
1
|
require 'hpricot'
|
2
2
|
|
3
3
|
module Jkl
|
4
|
-
|
5
|
-
|
4
|
+
module Rss
|
5
|
+
class << self
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
7
|
+
def items(rss_doc)
|
8
|
+
(rss_doc/:item)
|
9
|
+
end
|
10
|
+
|
11
|
+
def links(items)
|
12
|
+
items.map{|item| attribute_from(item,:link)}
|
13
|
+
end
|
14
|
+
|
15
|
+
def descriptions(items)
|
16
|
+
items.map do |item|
|
17
|
+
attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def attribute_from(item, name)
|
22
|
+
(item/name).inner_html
|
23
|
+
end
|
24
|
+
|
15
25
|
end
|
16
|
-
|
17
26
|
end
|
18
|
-
|
19
27
|
end
|