jakal 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,6 +11,10 @@ module Jkl
11
11
  )
12
12
  end
13
13
 
14
+ def entities(key,text)
15
+ calais_response(key, text).entities.map{|e| {e.type => [e.attributes["name"]]}}
16
+ end
17
+
14
18
  def tags(key, text)
15
19
  nested_list = {}
16
20
  entities(key,text).each do |a|
@@ -19,10 +23,6 @@ module Jkl
19
23
  nested_list
20
24
  end
21
25
 
22
- def entities(key,text)
23
- calais_response(key, text).entities.map{|e| {e.type => [e.attributes["name"]]}}
24
- end
25
-
26
26
  end
27
27
  end
28
28
  end
@@ -5,6 +5,7 @@ module Jkl
5
5
  def sanitize(text)
6
6
  remove_short_lines(strip_all_tags(remove_script_tags(text)))
7
7
  end
8
+ alias :clean :sanitize
8
9
 
9
10
  def strip_all_tags(text)
10
11
  text.gsub(/<\/?[^>]*>/, "")
data/lib/jkl.rb CHANGED
@@ -16,10 +16,19 @@ module Jkl
16
16
  end
17
17
  end
18
18
 
19
+ def topix_links(keyphrase)
20
+ url = YAML::load_file('config/config.yml')['topix']
21
+ links("#{url}#{keyphrase}")
22
+ end
23
+
19
24
  def tags(key, link)
20
25
  text = Jkl::Text::sanitize(Jkl::get_from(link))
21
26
  Jkl::Extraction::tags(key, text)
22
27
  end
23
28
 
29
+ def trends
30
+ url = YAML::load_file('config/config.yml')['twitter']
31
+ JSON.parse(Jkl::get_from(url))["trends"].map{|t| t["name"]}
32
+ end
24
33
  end
25
34
  end
File without changes
File without changes
@@ -0,0 +1,82 @@
1
+ require "test/unit"
2
+ require "shoulda"
3
+ require "webmock/test_unit"
4
+ require "yaml"
5
+ require "lib/jkl"
6
+
7
+ class JklTest < Test::Unit::TestCase
8
+ include WebMock
9
+
10
+ context "Using Jkl" do
11
+ setup do
12
+ stub_twitter
13
+ stub_topix
14
+ stub_news_article
15
+ end
16
+
17
+ should "GET trends" do
18
+ trends = Jkl::trends
19
+ assert trends.length == 10
20
+ assert trends[0] == "London"
21
+ end
22
+
23
+ should "GET news article URLS for a trend" do
24
+ articles = Jkl::topix_links(Jkl::trends[0])
25
+ assert articles.length == 2
26
+ assert articles[0] = "http://www.localnews8.com/Global/story.asp?S=10876507"
27
+ end
28
+
29
+ should "extract text from a news article" do
30
+ articles = Jkl::topix_links(Jkl::trends[0])
31
+ text = Jkl::Text::sanitize(Jkl::get_from(articles[0]))
32
+ assert_not_nil text
33
+ end
34
+
35
+ should "extract tags from some text" do
36
+ key = YAML::load_file('config/keys.yml')['calais']
37
+ text = <<-EOF
38
+ Barack Obama said today that he expects there
39
+ to be conflict within his new security team after
40
+ confirming Hillary Clinton as his choice for US Secretary of State."
41
+ EOF
42
+ tags = Jkl::Extraction::tags(key, text)
43
+ assert tags["Person"][0] == "Barack Obama"
44
+ puts Jkl::Extraction::entities(key,text)
45
+ end
46
+ end
47
+
48
+ private
49
+ def stub_twitter
50
+ url = YAML::load_file('config/config.yml')['twitter']
51
+ response = <<-EOF
52
+ {"trends":[
53
+ {"name":"London","url":"http://search.twitter.com/search?q=London"},
54
+ {"name":"Geneva","url":"http://search.twitter.com/search?q=Geneva"},
55
+ {"name":"Kabul","url":"http://search.twitter.com/search?q=Kabul"},
56
+ {"name":"Chicago","url":"http://search.twitter.com/search?q=Chicago"},
57
+ {"name":"Cannes","url":"http://search.twitter.com/search?q=Cannes"},
58
+ {"name":"Verona","url":"http://search.twitter.com/search?q=Verona"},
59
+ {"name":"Milan","url":"http://search.twitter.com/search?q=Milan"},
60
+ {"name":"New York","url":"http://search.twitter.com/search?q=New%20York"},
61
+ {"name":"Paris","url":"http://search.twitter.com/search?q=Paris"},
62
+ {"name":"Melbourne","url":"http://search.twitter.com/search?q=Melbourne"}
63
+ ],"as_of":"Sat, 1 Jan 1970 00:00:00 +0000"}
64
+ EOF
65
+ stub_request(:get, url).to_return(:body => response)
66
+ end
67
+ def stub_topix
68
+ url = YAML::load_file('config/config.yml')['topix']
69
+ response = raw = File.open('test/fixtures/topix_rss.xml','r') do |file|
70
+ file.readlines.to_s
71
+ end
72
+ stub_request(:get, "#{url}London").to_return(:body => response)
73
+ end
74
+ def stub_news_article
75
+ response = raw = File.open('test/fixtures/bbc_story.html','r') do |file|
76
+ file.readlines.to_s
77
+ end
78
+ stub_request(:get, "http://www.localnews8.com/Global/story.asp?S=10876507").to_return(
79
+ :body => response
80
+ )
81
+ end
82
+ end
@@ -0,0 +1,72 @@
1
+ require "test/unit"
2
+ require "shoulda"
3
+ require "webmock/test_unit"
4
+ require "yaml"
5
+ require "lib/jkl"
6
+
7
+ class TextCleaningTest < Test::Unit::TestCase
8
+ context "Cleaning Text" do
9
+
10
+ should "Remove short lines" do
11
+ input = <<-HTML
12
+ the cat sat on the mat
13
+ a short line
14
+ HTML
15
+ result = Jkl::Text::remove_short_lines input
16
+ assert result == "the cat sat on the mat"
17
+ end
18
+
19
+ should "Remove script tags" do
20
+ input = <<-HTML
21
+ the cat sat on the mat
22
+ <script type="text/javascript" charset="utf-8">
23
+ function nofunction(){var bob;}
24
+ </script>
25
+ a short line
26
+ HTML
27
+ result = Jkl::Text::remove_short_lines input
28
+ assert result == "the cat sat on the mat"
29
+ end
30
+
31
+ should "Remove html comments" do
32
+ input = <<-HTML
33
+ the cat sat on the mat
34
+ <!-- a comment-->
35
+ a short line
36
+ HTML
37
+ result = Jkl::Text::remove_short_lines input
38
+ assert result == "the cat sat on the mat"
39
+ end
40
+
41
+ should "Remove blank lines" do
42
+ input = <<-HTML
43
+ the cat sat on the mat
44
+
45
+ a short line
46
+ HTML
47
+ result = Jkl::Text::remove_short_lines input
48
+ assert result == "the cat sat on the mat"
49
+ end
50
+
51
+ should "Strip all tags" do
52
+ input = <<-HTML
53
+ <p>the cat sat on the mat</p>
54
+ HTML
55
+ result = Jkl::Text::strip_all_tags input
56
+ assert result == "the cat sat on the mat\n" #TODO fix carriage return
57
+ end
58
+
59
+ should "Clean text" do
60
+ input = <<-HTML
61
+ the cat sat on the mat
62
+ <script type="text/javascript" charset="utf-8">
63
+ function nofunction(){var bob;}
64
+ </script>
65
+ <p> some para stuff here </p>
66
+ some end stuff here
67
+ HTML
68
+ result = Jkl::Text::clean input
69
+ assert result == "the cat sat on the mat"
70
+ end
71
+ end
72
+ end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 6
9
- version: 0.1.6
8
+ - 7
9
+ version: 0.1.7
10
10
  platform: ruby
11
11
  authors:
12
12
  - sshingler
@@ -73,7 +73,7 @@ dependencies:
73
73
  version: 0.0.9
74
74
  type: :runtime
75
75
  version_requirements: *id004
76
- description: Jakal is a Ruby library which contains some utilities for connecting to internet based APIs.
76
+ description: Jakal is a Ruby library which contains some utilities for connecting to internet based APIs and cleaning text.
77
77
  email: "'shingler@gmail.com'"
78
78
  executables: []
79
79
 
@@ -88,20 +88,13 @@ files:
88
88
  - lib/jkl/rest_client.rb
89
89
  - lib/jkl/rss_client.rb
90
90
  - lib/jkl/text_client.rb
91
- - features/calais.feature
92
- - features/http.feature
93
- - features/sanitize-text.feature
94
- - features/mocks/bbc_story.html
95
- - features/mocks/calais.json
96
- - features/mocks/topix_rss.xml
97
- - features/step_definitions/calais_steps.rb
98
- - features/step_definitions/http_steps.rb
99
- - features/step_definitions/sanitize-text_steps.rb
100
- - features/step_definitions/twitter_steps.rb
101
- - features/support/env.rb
91
+ - test/fixtures/bbc_story.html
92
+ - test/fixtures/topix_rss.xml
93
+ - test/unit/jkl_test.rb
94
+ - test/unit/text_cleaning_test.rb
102
95
  - README.md
103
96
  - License.txt
104
- has_rdoc: false
97
+ has_rdoc: true
105
98
  homepage: http://github.com/sshingler/jkl
106
99
  licenses: []
107
100
 
@@ -131,6 +124,6 @@ rubyforge_project:
131
124
  rubygems_version: 1.3.6
132
125
  signing_key:
133
126
  specification_version: 2
134
- summary: Jakal is a Ruby library which contains some utilities for connecting to internet based APIs.
127
+ summary: Jakal is a Ruby library which contains some utilities for connecting to internet based APIs and cleaning text.
135
128
  test_files: []
136
129
 
@@ -1,10 +0,0 @@
1
- Feature: Calais-Specific features
2
- In order to use the Calais Meta-Tagging Service
3
- As a developer
4
- I want to make some requests and inspect some responses
5
-
6
- @live
7
- Scenario: Get nested tags from calais
8
- Given I have some text
9
- When I request the nested entities from calais
10
- Then I should receive the entities grouped into categories
@@ -1,20 +0,0 @@
1
- Feature: http features
2
- In order to use the utility client calls
3
- As a developer
4
- I want to make some requests and inspect some responses
5
-
6
- @live
7
- Scenario: Make a restful post to yahoo
8
- When I post some data to yahoo
9
- Then I should get a response
10
-
11
- @live
12
- Scenario: Make a restful get
13
- When I make a restful get request
14
- Then I should get a response
15
- And I should see some text
16
-
17
- @live
18
- Scenario: Get some trends
19
- When I request some twitter trends
20
- Then I should get a response