jakal 0.1.96 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,7 @@
1
+ *.gem
2
+ keys.yml
3
+ coverage
4
+ .bundle
5
+ webrat.log
6
+ ._*
7
+ .DS*
data/Gemfile ADDED
@@ -0,0 +1,13 @@
1
+ source :gemcutter
2
+
3
+ gemspec
4
+
5
+ group :development do
6
+ gem "ruby-debug19", "0.11.6", :require => nil
7
+ end
8
+
9
+ group :test do
10
+ gem "test-unit", "2.3.0", :require => "test/unit"
11
+ gem "shoulda", "2.11.3"
12
+ gem "webmock", "1.6.2"
13
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,57 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ jakal (0.2.0)
5
+ calais (>= 0.0.11)
6
+ mechanize (>= 1.0.0)
7
+ nokogiri (~> 1.4.4)
8
+ rake (>= 0.8.7)
9
+
10
+ GEM
11
+ remote: http://rubygems.org/
12
+ specs:
13
+ addressable (2.2.6)
14
+ archive-tar-minitar (0.5.2)
15
+ calais (0.0.11)
16
+ curb (>= 0.1.4)
17
+ json (>= 1.1.3)
18
+ nokogiri (>= 1.3.3)
19
+ columnize (0.3.2)
20
+ crack (0.1.8)
21
+ curb (0.7.15)
22
+ json (1.5.1)
23
+ linecache19 (0.5.12)
24
+ ruby_core_source (>= 0.1.4)
25
+ mechanize (1.0.0)
26
+ nokogiri (>= 1.2.1)
27
+ nokogiri (1.4.4)
28
+ rake (0.8.7)
29
+ ruby-debug-base19 (0.11.25)
30
+ columnize (>= 0.3.1)
31
+ linecache19 (>= 0.5.11)
32
+ ruby_core_source (>= 0.1.4)
33
+ ruby-debug19 (0.11.6)
34
+ columnize (>= 0.3.1)
35
+ linecache19 (>= 0.5.11)
36
+ ruby-debug-base19 (>= 0.11.19)
37
+ ruby_core_source (0.1.5)
38
+ archive-tar-minitar (>= 0.5.2)
39
+ shoulda (2.11.3)
40
+ test-unit (2.3.0)
41
+ webmock (1.6.2)
42
+ addressable (>= 2.2.2)
43
+ crack (>= 0.1.7)
44
+
45
+ PLATFORMS
46
+ ruby
47
+
48
+ DEPENDENCIES
49
+ calais (>= 0.0.11)
50
+ jakal!
51
+ mechanize (>= 1.0.0)
52
+ nokogiri (~> 1.4.4)
53
+ rake (>= 0.8.7)
54
+ ruby-debug19 (= 0.11.6)
55
+ shoulda (= 2.11.3)
56
+ test-unit (= 2.3.0)
57
+ webmock (= 1.6.2)
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
+ require 'rubygems'
2
+ require 'rake/testtask'
3
+
4
+ desc "Run all the tests"
5
+ Rake::TestTask.new do |t|
6
+ t.test_files = FileList['test/unit/*.rb']
7
+ end
data/jkl.gemspec ADDED
@@ -0,0 +1,19 @@
1
+ Gem::Specification.new do |s|
2
+ s.name = %q{jakal}
3
+ s.version = "0.2.0"
4
+ s.platform = Gem::Platform::RUBY
5
+ s.authors = ['sshingler']
6
+ s.homepage = %q{http://github.com/sshingler/jkl}
7
+ s.description = %q{Jakal is a Ruby library for tagging keywords from web pages.}
8
+ s.summary = s.description
9
+ s.email = %q{'shingler@gmail.com'}
10
+
11
+ s.require_paths = ["lib"]
12
+ s.files = `git ls-files`.split("\n")
13
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
14
+
15
+ s.add_dependency(%q<rake>, [">= 0.8.7"])
16
+ s.add_dependency(%q<mechanize>, [">= 1.0.0"])
17
+ s.add_dependency(%q<nokogiri>, ["~> 1.4.4"])
18
+ s.add_dependency(%q<calais>, [">= 0.0.11"])
19
+ end
@@ -1,27 +1,15 @@
1
- require 'hpricot'
2
1
 
3
2
  module Jkl
4
3
  module Rss
5
4
  class << self
6
-
7
- def items(rss_doc)
8
- (rss_doc/:item)
5
+ def items(rss)
6
+ rss_doc = Nokogiri::Slop(rss)
7
+ rss_doc.rss.channel.item
9
8
  end
10
9
 
11
10
  def links(items)
12
- items.map{|item| attribute_from(item,:link)}
13
- end
14
-
15
- def descriptions(items)
16
- items.map do |item|
17
- attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
18
- end
11
+ items.map{|item| item.link.inner_html}
19
12
  end
20
-
21
- def attribute_from(item, name)
22
- (item/name).inner_html
23
- end
24
-
25
13
  end
26
14
  end
27
15
  end
@@ -2,10 +2,9 @@ module Jkl
2
2
  module Text
3
3
  class << self
4
4
 
5
- def sanitize(text, words_on_line = 5)
6
- remove_short_lines(strip_all_tags(remove_script_tags(text)), words_on_line)
5
+ def plain_text(document, words_on_line = 5)
6
+ remove_short_lines(strip_all_tags(remove_script_tags(document)), words_on_line)
7
7
  end
8
- alias :clean :sanitize
9
8
 
10
9
  def strip_all_tags(text)
11
10
  text.gsub(/<\/?[^>]*>/, "")
data/lib/jkl.rb CHANGED
@@ -1,32 +1,22 @@
1
- $:.unshift(File.dirname(__FILE__)) unless
2
- $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
1
+ require_relative "jkl/rss_client"
2
+ require_relative "jkl/calais_client"
3
+ require_relative "jkl/text_client"
3
4
 
4
- require "jkl/rss_client"
5
- require "jkl/rest_client"
6
- require "jkl/calais_client"
7
- require "jkl/text_client"
5
+ require "mechanize"
8
6
 
9
7
  module Jkl
10
8
  class << self
11
9
 
10
+ def get(url)
11
+ agent = Mechanize.new
12
+ agent.get(url).body
13
+ end
14
+
12
15
  def links(feed)
13
- links = Jkl::Rss::links(Jkl::Rss::items(Jkl::get_xml_from(feed)))
16
+ links = Jkl::Rss::links(Jkl::Rss::items(Jkl::get(feed)))
14
17
  links.each do |link|
15
18
  yield link if block_given?
16
19
  end
17
20
  end
18
-
19
- def topix_links(keyphrase, url = "http://www.topix.net/rss/search/article?q=")
20
- links("#{url}#{keyphrase}")
21
- end
22
-
23
- def tags(key, link)
24
- text = Jkl::Text::sanitize(Jkl::get_from(link))
25
- Jkl::Extraction::tags(key, text)
26
- end
27
-
28
- def trends(url = "http://search.twitter.com/trends.json")
29
- JSON.parse(Jkl::get_from(url))["trends"].map{|t| t["name"]}
30
- end
31
21
  end
32
22
  end
@@ -2,78 +2,56 @@ require "test/unit"
2
2
  require "shoulda"
3
3
  require "webmock/test_unit"
4
4
  require "yaml"
5
+
5
6
  require_relative "../../lib/jkl"
6
7
 
7
8
  class JklTest < Test::Unit::TestCase
8
9
  include WebMock::API
9
10
 
10
- context "Using Jkl" do
11
+ context "for documents, plain text and tags" do
11
12
  setup do
12
- stub_twitter
13
- stub_topix
14
- stub_news_article
15
- end
16
-
17
- should "GET trends" do
18
- trends = Jkl::trends
19
- assert trends.length == 10
20
- assert trends[0] == "London"
13
+ @url = "http://www.bbc.co.uk"
14
+ response = File.read('test/fixtures/bbc_story.html')
15
+ stub_request(:get, @url).to_return(
16
+ :status => 200,
17
+ :body => response,
18
+ :headers => {'Content-Type' => 'text/html'})
21
19
  end
22
20
 
23
- should "GET news article URLS for a trend" do
24
- articles = Jkl::topix_links(Jkl::trends[0])
25
- assert articles.length == 2
26
- assert articles[0] = "http://www.localnews8.com/Global/story.asp?S=10876507"
21
+ should "Get a document from a URL" do
22
+ doc = Jkl::get(@url)
23
+ assert_not_nil doc
27
24
  end
28
-
29
- should "extract text from a news article" do
30
- articles = Jkl::topix_links(Jkl::trends[0])
31
- text = Jkl::Text::sanitize(Jkl::get_from(articles[0]))
32
- assert_not_nil text
25
+
26
+ should "Get the plain text version of a document" do
27
+ document = Jkl::get(@url)
28
+ text = Jkl::Text::plain_text(document,2)
29
+ assert_equal 8884, text.length
33
30
  end
34
-
35
- should "extract tags from some text" do
36
- keys = "config/keys.yml"
37
- raise "READ:::::::: You need to create #{keys} and put your calais credentials in it." unless File.exist?(keys)
38
- key = YAML::load_file(keys)['calais']
39
- text = <<-EOF
40
- Barack Obama said today that he expects there
41
- to be conflict within his new security team after
42
- confirming Hillary Clinton as his choice for US Secretary of State."
43
- EOF
44
- tags = Jkl::Extraction::tags(key, text)
45
- assert tags["Person"][0] == "Barack Obama"
31
+
32
+ should "Get the keywords from a document" do
33
+ document = Jkl::get(@url)
34
+ text = Jkl::Text::plain_text(document,2)
35
+ tags = Jkl::Extraction::tags(calais_key, text)
36
+ assert ! tags.empty?
46
37
  end
47
38
  end
48
39
 
49
- private
50
- def stub_twitter
51
- url = YAML::load_file('config/config.yml')['twitter']
52
- response = <<-EOF
53
- {"trends":[
54
- {"name":"London","url":"http://search.twitter.com/search?q=London"},
55
- {"name":"Geneva","url":"http://search.twitter.com/search?q=Geneva"},
56
- {"name":"Kabul","url":"http://search.twitter.com/search?q=Kabul"},
57
- {"name":"Chicago","url":"http://search.twitter.com/search?q=Chicago"},
58
- {"name":"Cannes","url":"http://search.twitter.com/search?q=Cannes"},
59
- {"name":"Verona","url":"http://search.twitter.com/search?q=Verona"},
60
- {"name":"Milan","url":"http://search.twitter.com/search?q=Milan"},
61
- {"name":"New York","url":"http://search.twitter.com/search?q=New%20York"},
62
- {"name":"Paris","url":"http://search.twitter.com/search?q=Paris"},
63
- {"name":"Melbourne","url":"http://search.twitter.com/search?q=Melbourne"}
64
- ],"as_of":"Sat, 1 Jan 1970 00:00:00 +0000"}
65
- EOF
66
- stub_request(:get, url).to_return(:body => response)
67
- end
68
- def stub_topix
69
- url = YAML::load_file('config/config.yml')['topix']
70
- response = File.read('test/fixtures/topix_rss.xml')
71
- stub_request(:get, "#{url}London").to_return(:body => response)
72
- end
73
- def stub_news_article
74
- response = File.read('test/fixtures/bbc_story.html')
75
- stub_request(:get, "http://www.localnews8.com/Global/story.asp?S=10876507").to_return(
76
- :body => response
77
- )
40
+ context "for RSS" do
41
+ should "Get links from a feed" do
42
+ feed = "http://feeds.bbci.co.uk/news/rss.xml"
43
+ response = File.read('test/fixtures/topix_rss.xml')
44
+ stub_request(:get, "http://feeds.bbci.co.uk/news/rss.xml").
45
+ to_return(:status => 200, :body => response, :headers => {})
46
+ first_link = "http://www.localnews8.com/Global/story.asp?S=10876507"
47
+ assert_equal first_link, Jkl::links(feed).first
48
+ end
78
49
  end
79
- end
50
+
51
+ private
52
+ def calais_key
53
+ keys = "config/keys.yml"
54
+ raise "READ:::::::: You need to create #{keys} and put your calais credentials in it." unless File.exist?(keys)
55
+ YAML::load_file(keys)['calais']
56
+ end
57
+ end
@@ -75,7 +75,7 @@ the cat sat on the mat
75
75
  <p> some para stuff here </p>
76
76
  some end stuff here
77
77
  HTML
78
- result = Jkl::Text::clean input
78
+ result = Jkl::Text::plain_text(input)
79
79
  assert result == "the cat sat on the mat"
80
80
  end
81
81
  end
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 1
8
- - 96
9
- version: 0.1.96
7
+ - 2
8
+ - 0
9
+ version: 0.2.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - sshingler
@@ -14,11 +14,11 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-04-13 00:00:00 +00:00
17
+ date: 2011-05-28 00:00:00 +00:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
- name: hpricot
21
+ name: rake
22
22
  prerelease: false
23
23
  requirement: &id001 !ruby/object:Gem::Requirement
24
24
  none: false
@@ -28,12 +28,12 @@ dependencies:
28
28
  segments:
29
29
  - 0
30
30
  - 8
31
- - 2
32
- version: 0.8.2
31
+ - 7
32
+ version: 0.8.7
33
33
  type: :runtime
34
34
  version_requirements: *id001
35
35
  - !ruby/object:Gem::Dependency
36
- name: json
36
+ name: mechanize
37
37
  prerelease: false
38
38
  requirement: &id002 !ruby/object:Gem::Requirement
39
39
  none: false
@@ -42,24 +42,24 @@ dependencies:
42
42
  - !ruby/object:Gem::Version
43
43
  segments:
44
44
  - 1
45
- - 2
46
- - 4
47
- version: 1.2.4
45
+ - 0
46
+ - 0
47
+ version: 1.0.0
48
48
  type: :runtime
49
49
  version_requirements: *id002
50
50
  - !ruby/object:Gem::Dependency
51
- name: rest-client
51
+ name: nokogiri
52
52
  prerelease: false
53
53
  requirement: &id003 !ruby/object:Gem::Requirement
54
54
  none: false
55
55
  requirements:
56
- - - ">="
56
+ - - ~>
57
57
  - !ruby/object:Gem::Version
58
58
  segments:
59
59
  - 1
60
60
  - 4
61
- - 2
62
- version: 1.4.2
61
+ - 4
62
+ version: 1.4.4
63
63
  type: :runtime
64
64
  version_requirements: *id003
65
65
  - !ruby/object:Gem::Dependency
@@ -73,40 +73,41 @@ dependencies:
73
73
  segments:
74
74
  - 0
75
75
  - 0
76
- - 9
77
- version: 0.0.9
76
+ - 11
77
+ version: 0.0.11
78
78
  type: :runtime
79
79
  version_requirements: *id004
80
- description: Jakal is a Ruby library which contains some utilities for tagging content, cleaning text from web pages and working with RSS feeds.
80
+ description: Jakal is a Ruby library for tagging keywords from web pages.
81
81
  email: "'shingler@gmail.com'"
82
82
  executables: []
83
83
 
84
84
  extensions: []
85
85
 
86
- extra_rdoc_files:
87
- - README.md
88
- - License.txt
86
+ extra_rdoc_files: []
87
+
89
88
  files:
89
+ - .gitignore
90
+ - Gemfile
91
+ - Gemfile.lock
92
+ - License.txt
93
+ - README.md
94
+ - Rakefile
95
+ - jkl.gemspec
90
96
  - lib/jkl.rb
91
97
  - lib/jkl/calais_client.rb
92
- - lib/jkl/rest_client.rb
93
98
  - lib/jkl/rss_client.rb
94
99
  - lib/jkl/text_client.rb
95
100
  - test/fixtures/bbc_story.html
96
101
  - test/fixtures/topix_rss.xml
97
102
  - test/unit/jkl_test.rb
98
103
  - test/unit/text_cleaning_test.rb
99
- - config/config.yml
100
- - README.md
101
- - License.txt
102
104
  has_rdoc: true
103
105
  homepage: http://github.com/sshingler/jkl
104
106
  licenses: []
105
107
 
106
108
  post_install_message:
107
- rdoc_options:
108
- - --inline-source
109
- - --charset=UTF-8
109
+ rdoc_options: []
110
+
110
111
  require_paths:
111
112
  - lib
112
113
  required_ruby_version: !ruby/object:Gem::Requirement
@@ -130,7 +131,10 @@ requirements: []
130
131
  rubyforge_project:
131
132
  rubygems_version: 1.3.7
132
133
  signing_key:
133
- specification_version: 2
134
- summary: Jakal is a Ruby library which contains some utilities for tagging content, cleaning text from web pages and working with RSS feeds.
135
- test_files: []
136
-
134
+ specification_version: 3
135
+ summary: Jakal is a Ruby library for tagging keywords from web pages.
136
+ test_files:
137
+ - test/fixtures/bbc_story.html
138
+ - test/fixtures/topix_rss.xml
139
+ - test/unit/jkl_test.rb
140
+ - test/unit/text_cleaning_test.rb
data/config/config.yml DELETED
@@ -1,4 +0,0 @@
1
- topix:
2
- http://www.topix.net/rss/search/article?q=
3
- twitter:
4
- http://search.twitter.com/trends.json
@@ -1,49 +0,0 @@
1
- require 'net/http'
2
- require 'hpricot'
3
-
4
- module Jkl
5
- class << self
6
-
7
- def post_to(url, post_args = {})
8
- begin
9
- uri = URI.parse(url)
10
- resp, data = Net::HTTP.post_form(uri, post_args)
11
- data
12
- rescue URI::InvalidURIError => e
13
- puts("WARN: JKL Invalid URI: #{e}")
14
- rescue SocketError => e
15
- puts("WARN: JKL Could not connect: #{e}")
16
- rescue Errno::ECONNREFUSED => e
17
- puts("WARN: JKL Connection refused: #{e}")
18
- end
19
- end
20
-
21
- def get_from(uri)
22
- begin
23
- response = Net::HTTP.get_response(URI.parse(uri))
24
- response.body
25
- rescue URI::InvalidURIError => e
26
- puts("WARN: JKL Invalid URI: #{e}")
27
- rescue SocketError => e
28
- puts("WARN: JKL Could not connect: #{e}")
29
- rescue Errno::ECONNREFUSED => e
30
- puts("WARN: JKL Connection refused: #{e}")
31
- end
32
- end
33
-
34
- def get_from_over_https(host, path)
35
- http = Net::HTTP.new(host, "443")
36
- http.use_ssl = true
37
- http.get2(path) # returns [status, data]
38
- end
39
-
40
- def get_xml_from(uri)
41
- Hpricot.XML(get_from(uri))
42
- end
43
-
44
- def document_from(text)
45
- Hpricot(text)
46
- end
47
-
48
- end
49
- end