jakal 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/jkl/calais_client.rb +4 -4
- data/lib/jkl/text_client.rb +1 -0
- data/lib/jkl.rb +9 -0
- data/{features/mocks → test/fixtures}/bbc_story.html +0 -0
- data/{features/mocks → test/fixtures}/topix_rss.xml +0 -0
- data/test/unit/jkl_test.rb +82 -0
- data/test/unit/text_cleaning_test.rb +72 -0
- metadata +9 -16
- data/features/calais.feature +0 -10
- data/features/http.feature +0 -20
- data/features/mocks/calais.json +0 -2464
- data/features/sanitize-text.feature +0 -71
- data/features/step_definitions/calais_steps.rb +0 -13
- data/features/step_definitions/http_steps.rb +0 -38
- data/features/step_definitions/sanitize-text_steps.rb +0 -79
- data/features/step_definitions/twitter_steps.rb +0 -13
- data/features/support/env.rb +0 -17
data/lib/jkl/calais_client.rb
CHANGED
@@ -11,6 +11,10 @@ module Jkl
|
|
11
11
|
)
|
12
12
|
end
|
13
13
|
|
14
|
+
def entities(key,text)
|
15
|
+
calais_response(key, text).entities.map{|e| {e.type => [e.attributes["name"]]}}
|
16
|
+
end
|
17
|
+
|
14
18
|
def tags(key, text)
|
15
19
|
nested_list = {}
|
16
20
|
entities(key,text).each do |a|
|
@@ -19,10 +23,6 @@ module Jkl
|
|
19
23
|
nested_list
|
20
24
|
end
|
21
25
|
|
22
|
-
def entities(key,text)
|
23
|
-
calais_response(key, text).entities.map{|e| {e.type => [e.attributes["name"]]}}
|
24
|
-
end
|
25
|
-
|
26
26
|
end
|
27
27
|
end
|
28
28
|
end
|
data/lib/jkl/text_client.rb
CHANGED
data/lib/jkl.rb
CHANGED
@@ -16,10 +16,19 @@ module Jkl
|
|
16
16
|
end
|
17
17
|
end
|
18
18
|
|
19
|
+
def topix_links(keyphrase)
|
20
|
+
url = YAML::load_file('config/config.yml')['topix']
|
21
|
+
links("#{url}#{keyphrase}")
|
22
|
+
end
|
23
|
+
|
19
24
|
def tags(key, link)
|
20
25
|
text = Jkl::Text::sanitize(Jkl::get_from(link))
|
21
26
|
Jkl::Extraction::tags(key, text)
|
22
27
|
end
|
23
28
|
|
29
|
+
def trends
|
30
|
+
url = YAML::load_file('config/config.yml')['twitter']
|
31
|
+
JSON.parse(Jkl::get_from(url))["trends"].map{|t| t["name"]}
|
32
|
+
end
|
24
33
|
end
|
25
34
|
end
|
File without changes
|
File without changes
|
@@ -0,0 +1,82 @@
|
|
1
|
+
require "test/unit"
|
2
|
+
require "shoulda"
|
3
|
+
require "webmock/test_unit"
|
4
|
+
require "yaml"
|
5
|
+
require "lib/jkl"
|
6
|
+
|
7
|
+
class JklTest < Test::Unit::TestCase
|
8
|
+
include WebMock
|
9
|
+
|
10
|
+
context "Using Jkl" do
|
11
|
+
setup do
|
12
|
+
stub_twitter
|
13
|
+
stub_topix
|
14
|
+
stub_news_article
|
15
|
+
end
|
16
|
+
|
17
|
+
should "GET trends" do
|
18
|
+
trends = Jkl::trends
|
19
|
+
assert trends.length == 10
|
20
|
+
assert trends[0] == "London"
|
21
|
+
end
|
22
|
+
|
23
|
+
should "GET news article URLS for a trend" do
|
24
|
+
articles = Jkl::topix_links(Jkl::trends[0])
|
25
|
+
assert articles.length == 2
|
26
|
+
assert articles[0] = "http://www.localnews8.com/Global/story.asp?S=10876507"
|
27
|
+
end
|
28
|
+
|
29
|
+
should "extract text from a news article" do
|
30
|
+
articles = Jkl::topix_links(Jkl::trends[0])
|
31
|
+
text = Jkl::Text::sanitize(Jkl::get_from(articles[0]))
|
32
|
+
assert_not_nil text
|
33
|
+
end
|
34
|
+
|
35
|
+
should "extract tags from some text" do
|
36
|
+
key = YAML::load_file('config/keys.yml')['calais']
|
37
|
+
text = <<-EOF
|
38
|
+
Barack Obama said today that he expects there
|
39
|
+
to be conflict within his new security team after
|
40
|
+
confirming Hillary Clinton as his choice for US Secretary of State."
|
41
|
+
EOF
|
42
|
+
tags = Jkl::Extraction::tags(key, text)
|
43
|
+
assert tags["Person"][0] == "Barack Obama"
|
44
|
+
puts Jkl::Extraction::entities(key,text)
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
private
|
49
|
+
def stub_twitter
|
50
|
+
url = YAML::load_file('config/config.yml')['twitter']
|
51
|
+
response = <<-EOF
|
52
|
+
{"trends":[
|
53
|
+
{"name":"London","url":"http://search.twitter.com/search?q=London"},
|
54
|
+
{"name":"Geneva","url":"http://search.twitter.com/search?q=Geneva"},
|
55
|
+
{"name":"Kabul","url":"http://search.twitter.com/search?q=Kabul"},
|
56
|
+
{"name":"Chicago","url":"http://search.twitter.com/search?q=Chicago"},
|
57
|
+
{"name":"Cannes","url":"http://search.twitter.com/search?q=Cannes"},
|
58
|
+
{"name":"Verona","url":"http://search.twitter.com/search?q=Verona"},
|
59
|
+
{"name":"Milan","url":"http://search.twitter.com/search?q=Milan"},
|
60
|
+
{"name":"New York","url":"http://search.twitter.com/search?q=New%20York"},
|
61
|
+
{"name":"Paris","url":"http://search.twitter.com/search?q=Paris"},
|
62
|
+
{"name":"Melbourne","url":"http://search.twitter.com/search?q=Melbourne"}
|
63
|
+
],"as_of":"Sat, 1 Jan 1970 00:00:00 +0000"}
|
64
|
+
EOF
|
65
|
+
stub_request(:get, url).to_return(:body => response)
|
66
|
+
end
|
67
|
+
def stub_topix
|
68
|
+
url = YAML::load_file('config/config.yml')['topix']
|
69
|
+
response = raw = File.open('test/fixtures/topix_rss.xml','r') do |file|
|
70
|
+
file.readlines.to_s
|
71
|
+
end
|
72
|
+
stub_request(:get, "#{url}London").to_return(:body => response)
|
73
|
+
end
|
74
|
+
def stub_news_article
|
75
|
+
response = raw = File.open('test/fixtures/bbc_story.html','r') do |file|
|
76
|
+
file.readlines.to_s
|
77
|
+
end
|
78
|
+
stub_request(:get, "http://www.localnews8.com/Global/story.asp?S=10876507").to_return(
|
79
|
+
:body => response
|
80
|
+
)
|
81
|
+
end
|
82
|
+
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
require "test/unit"
|
2
|
+
require "shoulda"
|
3
|
+
require "webmock/test_unit"
|
4
|
+
require "yaml"
|
5
|
+
require "lib/jkl"
|
6
|
+
|
7
|
+
class TextCleaningTest < Test::Unit::TestCase
|
8
|
+
context "Cleaning Text" do
|
9
|
+
|
10
|
+
should "Remove short lines" do
|
11
|
+
input = <<-HTML
|
12
|
+
the cat sat on the mat
|
13
|
+
a short line
|
14
|
+
HTML
|
15
|
+
result = Jkl::Text::remove_short_lines input
|
16
|
+
assert result == "the cat sat on the mat"
|
17
|
+
end
|
18
|
+
|
19
|
+
should "Remove script tags" do
|
20
|
+
input = <<-HTML
|
21
|
+
the cat sat on the mat
|
22
|
+
<script type="text/javascript" charset="utf-8">
|
23
|
+
function nofunction(){var bob;}
|
24
|
+
</script>
|
25
|
+
a short line
|
26
|
+
HTML
|
27
|
+
result = Jkl::Text::remove_short_lines input
|
28
|
+
assert result == "the cat sat on the mat"
|
29
|
+
end
|
30
|
+
|
31
|
+
should "Remove html comments" do
|
32
|
+
input = <<-HTML
|
33
|
+
the cat sat on the mat
|
34
|
+
<!-- a comment-->
|
35
|
+
a short line
|
36
|
+
HTML
|
37
|
+
result = Jkl::Text::remove_short_lines input
|
38
|
+
assert result == "the cat sat on the mat"
|
39
|
+
end
|
40
|
+
|
41
|
+
should "Remove blank lines" do
|
42
|
+
input = <<-HTML
|
43
|
+
the cat sat on the mat
|
44
|
+
|
45
|
+
a short line
|
46
|
+
HTML
|
47
|
+
result = Jkl::Text::remove_short_lines input
|
48
|
+
assert result == "the cat sat on the mat"
|
49
|
+
end
|
50
|
+
|
51
|
+
should "Strip all tags" do
|
52
|
+
input = <<-HTML
|
53
|
+
<p>the cat sat on the mat</p>
|
54
|
+
HTML
|
55
|
+
result = Jkl::Text::strip_all_tags input
|
56
|
+
assert result == "the cat sat on the mat\n" #TODO fix carriage return
|
57
|
+
end
|
58
|
+
|
59
|
+
should "Clean text" do
|
60
|
+
input = <<-HTML
|
61
|
+
the cat sat on the mat
|
62
|
+
<script type="text/javascript" charset="utf-8">
|
63
|
+
function nofunction(){var bob;}
|
64
|
+
</script>
|
65
|
+
<p> some para stuff here </p>
|
66
|
+
some end stuff here
|
67
|
+
HTML
|
68
|
+
result = Jkl::Text::clean input
|
69
|
+
assert result == "the cat sat on the mat"
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 6
|
9
|
-
version: 0.1.6
|
8
|
+
- 7
|
9
|
+
version: 0.1.7
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- sshingler
|
@@ -73,7 +73,7 @@ dependencies:
|
|
73
73
|
version: 0.0.9
|
74
74
|
type: :runtime
|
75
75
|
version_requirements: *id004
|
76
|
-
description: Jakal is a Ruby library which contains some utilities for connecting to internet based APIs.
|
76
|
+
description: Jakal is a Ruby library which contains some utilities for connecting to internet based APIs and cleaning text.
|
77
77
|
email: "'shingler@gmail.com'"
|
78
78
|
executables: []
|
79
79
|
|
@@ -88,20 +88,13 @@ files:
|
|
88
88
|
- lib/jkl/rest_client.rb
|
89
89
|
- lib/jkl/rss_client.rb
|
90
90
|
- lib/jkl/text_client.rb
|
91
|
-
-
|
92
|
-
-
|
93
|
-
-
|
94
|
-
-
|
95
|
-
- features/mocks/calais.json
|
96
|
-
- features/mocks/topix_rss.xml
|
97
|
-
- features/step_definitions/calais_steps.rb
|
98
|
-
- features/step_definitions/http_steps.rb
|
99
|
-
- features/step_definitions/sanitize-text_steps.rb
|
100
|
-
- features/step_definitions/twitter_steps.rb
|
101
|
-
- features/support/env.rb
|
91
|
+
- test/fixtures/bbc_story.html
|
92
|
+
- test/fixtures/topix_rss.xml
|
93
|
+
- test/unit/jkl_test.rb
|
94
|
+
- test/unit/text_cleaning_test.rb
|
102
95
|
- README.md
|
103
96
|
- License.txt
|
104
|
-
has_rdoc:
|
97
|
+
has_rdoc: true
|
105
98
|
homepage: http://github.com/sshingler/jkl
|
106
99
|
licenses: []
|
107
100
|
|
@@ -131,6 +124,6 @@ rubyforge_project:
|
|
131
124
|
rubygems_version: 1.3.6
|
132
125
|
signing_key:
|
133
126
|
specification_version: 2
|
134
|
-
summary: Jakal is a Ruby library which contains some utilities for connecting to internet based APIs.
|
127
|
+
summary: Jakal is a Ruby library which contains some utilities for connecting to internet based APIs and cleaning text.
|
135
128
|
test_files: []
|
136
129
|
|
data/features/calais.feature
DELETED
@@ -1,10 +0,0 @@
|
|
1
|
-
Feature: Calais-Specific features
|
2
|
-
In order to use the Calais Meta-Tagging Service
|
3
|
-
As a developer
|
4
|
-
I want to make some requests and inspect some responses
|
5
|
-
|
6
|
-
@live
|
7
|
-
Scenario: Get nested tags from calais
|
8
|
-
Given I have some text
|
9
|
-
When I request the nested entities from calais
|
10
|
-
Then I should receive the entities grouped into categories
|
data/features/http.feature
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
Feature: http features
|
2
|
-
In order to use the utility client calls
|
3
|
-
As a developer
|
4
|
-
I want to make some requests and inspect some responses
|
5
|
-
|
6
|
-
@live
|
7
|
-
Scenario: Make a restful post to yahoo
|
8
|
-
When I post some data to yahoo
|
9
|
-
Then I should get a response
|
10
|
-
|
11
|
-
@live
|
12
|
-
Scenario: Make a restful get
|
13
|
-
When I make a restful get request
|
14
|
-
Then I should get a response
|
15
|
-
And I should see some text
|
16
|
-
|
17
|
-
@live
|
18
|
-
Scenario: Get some trends
|
19
|
-
When I request some twitter trends
|
20
|
-
Then I should get a response
|