jakal 0.1.96 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,7 @@
1
+ *.gem
2
+ keys.yml
3
+ coverage
4
+ .bundle
5
+ webrat.log
6
+ ._*
7
+ .DS*
data/Gemfile ADDED
@@ -0,0 +1,13 @@
1
+ source :gemcutter
2
+
3
+ gemspec
4
+
5
+ group :development do
6
+ gem "ruby-debug19", "0.11.6", :require => nil
7
+ end
8
+
9
+ group :test do
10
+ gem "test-unit", "2.3.0", :require => "test/unit"
11
+ gem "shoulda", "2.11.3"
12
+ gem "webmock", "1.6.2"
13
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,57 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ jakal (0.2.0)
5
+ calais (>= 0.0.11)
6
+ mechanize (>= 1.0.0)
7
+ nokogiri (~> 1.4.4)
8
+ rake (>= 0.8.7)
9
+
10
+ GEM
11
+ remote: http://rubygems.org/
12
+ specs:
13
+ addressable (2.2.6)
14
+ archive-tar-minitar (0.5.2)
15
+ calais (0.0.11)
16
+ curb (>= 0.1.4)
17
+ json (>= 1.1.3)
18
+ nokogiri (>= 1.3.3)
19
+ columnize (0.3.2)
20
+ crack (0.1.8)
21
+ curb (0.7.15)
22
+ json (1.5.1)
23
+ linecache19 (0.5.12)
24
+ ruby_core_source (>= 0.1.4)
25
+ mechanize (1.0.0)
26
+ nokogiri (>= 1.2.1)
27
+ nokogiri (1.4.4)
28
+ rake (0.8.7)
29
+ ruby-debug-base19 (0.11.25)
30
+ columnize (>= 0.3.1)
31
+ linecache19 (>= 0.5.11)
32
+ ruby_core_source (>= 0.1.4)
33
+ ruby-debug19 (0.11.6)
34
+ columnize (>= 0.3.1)
35
+ linecache19 (>= 0.5.11)
36
+ ruby-debug-base19 (>= 0.11.19)
37
+ ruby_core_source (0.1.5)
38
+ archive-tar-minitar (>= 0.5.2)
39
+ shoulda (2.11.3)
40
+ test-unit (2.3.0)
41
+ webmock (1.6.2)
42
+ addressable (>= 2.2.2)
43
+ crack (>= 0.1.7)
44
+
45
+ PLATFORMS
46
+ ruby
47
+
48
+ DEPENDENCIES
49
+ calais (>= 0.0.11)
50
+ jakal!
51
+ mechanize (>= 1.0.0)
52
+ nokogiri (~> 1.4.4)
53
+ rake (>= 0.8.7)
54
+ ruby-debug19 (= 0.11.6)
55
+ shoulda (= 2.11.3)
56
+ test-unit (= 2.3.0)
57
+ webmock (= 1.6.2)
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
+ require 'rubygems'
2
+ require 'rake/testtask'
3
+
4
+ desc "Run all the tests"
5
+ Rake::TestTask.new do |t|
6
+ t.test_files = FileList['test/unit/*.rb']
7
+ end
data/jkl.gemspec ADDED
@@ -0,0 +1,19 @@
1
+ Gem::Specification.new do |s|
2
+ s.name = %q{jakal}
3
+ s.version = "0.2.0"
4
+ s.platform = Gem::Platform::RUBY
5
+ s.authors = ['sshingler']
6
+ s.homepage = %q{http://github.com/sshingler/jkl}
7
+ s.description = %q{Jakal is a Ruby library for tagging keywords from web pages.}
8
+ s.summary = s.description
9
+ s.email = %q{'shingler@gmail.com'}
10
+
11
+ s.require_paths = ["lib"]
12
+ s.files = `git ls-files`.split("\n")
13
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
14
+
15
+ s.add_dependency(%q<rake>, [">= 0.8.7"])
16
+ s.add_dependency(%q<mechanize>, [">= 1.0.0"])
17
+ s.add_dependency(%q<nokogiri>, ["~> 1.4.4"])
18
+ s.add_dependency(%q<calais>, [">= 0.0.11"])
19
+ end
@@ -1,27 +1,15 @@
1
- require 'hpricot'
2
1
 
3
2
  module Jkl
4
3
  module Rss
5
4
  class << self
6
-
7
- def items(rss_doc)
8
- (rss_doc/:item)
5
+ def items(rss)
6
+ rss_doc = Nokogiri::Slop(rss)
7
+ rss_doc.rss.channel.item
9
8
  end
10
9
 
11
10
  def links(items)
12
- items.map{|item| attribute_from(item,:link)}
13
- end
14
-
15
- def descriptions(items)
16
- items.map do |item|
17
- attribute_from(item, :description).gsub("<![CDATA[","").gsub("]]>","")
18
- end
11
+ items.map{|item| item.link.inner_html}
19
12
  end
20
-
21
- def attribute_from(item, name)
22
- (item/name).inner_html
23
- end
24
-
25
13
  end
26
14
  end
27
15
  end
@@ -2,10 +2,9 @@ module Jkl
2
2
  module Text
3
3
  class << self
4
4
 
5
- def sanitize(text, words_on_line = 5)
6
- remove_short_lines(strip_all_tags(remove_script_tags(text)), words_on_line)
5
+ def plain_text(document, words_on_line = 5)
6
+ remove_short_lines(strip_all_tags(remove_script_tags(document)), words_on_line)
7
7
  end
8
- alias :clean :sanitize
9
8
 
10
9
  def strip_all_tags(text)
11
10
  text.gsub(/<\/?[^>]*>/, "")
data/lib/jkl.rb CHANGED
@@ -1,32 +1,22 @@
1
- $:.unshift(File.dirname(__FILE__)) unless
2
- $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
1
+ require_relative "jkl/rss_client"
2
+ require_relative "jkl/calais_client"
3
+ require_relative "jkl/text_client"
3
4
 
4
- require "jkl/rss_client"
5
- require "jkl/rest_client"
6
- require "jkl/calais_client"
7
- require "jkl/text_client"
5
+ require "mechanize"
8
6
 
9
7
  module Jkl
10
8
  class << self
11
9
 
10
+ def get(url)
11
+ agent = Mechanize.new
12
+ agent.get(url).body
13
+ end
14
+
12
15
  def links(feed)
13
- links = Jkl::Rss::links(Jkl::Rss::items(Jkl::get_xml_from(feed)))
16
+ links = Jkl::Rss::links(Jkl::Rss::items(Jkl::get(feed)))
14
17
  links.each do |link|
15
18
  yield link if block_given?
16
19
  end
17
20
  end
18
-
19
- def topix_links(keyphrase, url = "http://www.topix.net/rss/search/article?q=")
20
- links("#{url}#{keyphrase}")
21
- end
22
-
23
- def tags(key, link)
24
- text = Jkl::Text::sanitize(Jkl::get_from(link))
25
- Jkl::Extraction::tags(key, text)
26
- end
27
-
28
- def trends(url = "http://search.twitter.com/trends.json")
29
- JSON.parse(Jkl::get_from(url))["trends"].map{|t| t["name"]}
30
- end
31
21
  end
32
22
  end
@@ -2,78 +2,56 @@ require "test/unit"
2
2
  require "shoulda"
3
3
  require "webmock/test_unit"
4
4
  require "yaml"
5
+
5
6
  require_relative "../../lib/jkl"
6
7
 
7
8
  class JklTest < Test::Unit::TestCase
8
9
  include WebMock::API
9
10
 
10
- context "Using Jkl" do
11
+ context "for documents, plain text and tags" do
11
12
  setup do
12
- stub_twitter
13
- stub_topix
14
- stub_news_article
15
- end
16
-
17
- should "GET trends" do
18
- trends = Jkl::trends
19
- assert trends.length == 10
20
- assert trends[0] == "London"
13
+ @url = "http://www.bbc.co.uk"
14
+ response = File.read('test/fixtures/bbc_story.html')
15
+ stub_request(:get, @url).to_return(
16
+ :status => 200,
17
+ :body => response,
18
+ :headers => {'Content-Type' => 'text/html'})
21
19
  end
22
20
 
23
- should "GET news article URLS for a trend" do
24
- articles = Jkl::topix_links(Jkl::trends[0])
25
- assert articles.length == 2
26
- assert articles[0] = "http://www.localnews8.com/Global/story.asp?S=10876507"
21
+ should "Get a document from a URL" do
22
+ doc = Jkl::get(@url)
23
+ assert_not_nil doc
27
24
  end
28
-
29
- should "extract text from a news article" do
30
- articles = Jkl::topix_links(Jkl::trends[0])
31
- text = Jkl::Text::sanitize(Jkl::get_from(articles[0]))
32
- assert_not_nil text
25
+
26
+ should "Get the plain text version of a document" do
27
+ document = Jkl::get(@url)
28
+ text = Jkl::Text::plain_text(document,2)
29
+ assert_equal 8884, text.length
33
30
  end
34
-
35
- should "extract tags from some text" do
36
- keys = "config/keys.yml"
37
- raise "READ:::::::: You need to create #{keys} and put your calais credentials in it." unless File.exist?(keys)
38
- key = YAML::load_file(keys)['calais']
39
- text = <<-EOF
40
- Barack Obama said today that he expects there
41
- to be conflict within his new security team after
42
- confirming Hillary Clinton as his choice for US Secretary of State."
43
- EOF
44
- tags = Jkl::Extraction::tags(key, text)
45
- assert tags["Person"][0] == "Barack Obama"
31
+
32
+ should "Get the keywords from a document" do
33
+ document = Jkl::get(@url)
34
+ text = Jkl::Text::plain_text(document,2)
35
+ tags = Jkl::Extraction::tags(calais_key, text)
36
+ assert ! tags.empty?
46
37
  end
47
38
  end
48
39
 
49
- private
50
- def stub_twitter
51
- url = YAML::load_file('config/config.yml')['twitter']
52
- response = <<-EOF
53
- {"trends":[
54
- {"name":"London","url":"http://search.twitter.com/search?q=London"},
55
- {"name":"Geneva","url":"http://search.twitter.com/search?q=Geneva"},
56
- {"name":"Kabul","url":"http://search.twitter.com/search?q=Kabul"},
57
- {"name":"Chicago","url":"http://search.twitter.com/search?q=Chicago"},
58
- {"name":"Cannes","url":"http://search.twitter.com/search?q=Cannes"},
59
- {"name":"Verona","url":"http://search.twitter.com/search?q=Verona"},
60
- {"name":"Milan","url":"http://search.twitter.com/search?q=Milan"},
61
- {"name":"New York","url":"http://search.twitter.com/search?q=New%20York"},
62
- {"name":"Paris","url":"http://search.twitter.com/search?q=Paris"},
63
- {"name":"Melbourne","url":"http://search.twitter.com/search?q=Melbourne"}
64
- ],"as_of":"Sat, 1 Jan 1970 00:00:00 +0000"}
65
- EOF
66
- stub_request(:get, url).to_return(:body => response)
67
- end
68
- def stub_topix
69
- url = YAML::load_file('config/config.yml')['topix']
70
- response = File.read('test/fixtures/topix_rss.xml')
71
- stub_request(:get, "#{url}London").to_return(:body => response)
72
- end
73
- def stub_news_article
74
- response = File.read('test/fixtures/bbc_story.html')
75
- stub_request(:get, "http://www.localnews8.com/Global/story.asp?S=10876507").to_return(
76
- :body => response
77
- )
40
+ context "for RSS" do
41
+ should "Get links from a feed" do
42
+ feed = "http://feeds.bbci.co.uk/news/rss.xml"
43
+ response = File.read('test/fixtures/topix_rss.xml')
44
+ stub_request(:get, "http://feeds.bbci.co.uk/news/rss.xml").
45
+ to_return(:status => 200, :body => response, :headers => {})
46
+ first_link = "http://www.localnews8.com/Global/story.asp?S=10876507"
47
+ assert_equal first_link, Jkl::links(feed).first
48
+ end
78
49
  end
79
- end
50
+
51
+ private
52
+ def calais_key
53
+ keys = "config/keys.yml"
54
+ raise "READ:::::::: You need to create #{keys} and put your calais credentials in it." unless File.exist?(keys)
55
+ YAML::load_file(keys)['calais']
56
+ end
57
+ end
@@ -75,7 +75,7 @@ the cat sat on the mat
75
75
  <p> some para stuff here </p>
76
76
  some end stuff here
77
77
  HTML
78
- result = Jkl::Text::clean input
78
+ result = Jkl::Text::plain_text(input)
79
79
  assert result == "the cat sat on the mat"
80
80
  end
81
81
  end
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 1
8
- - 96
9
- version: 0.1.96
7
+ - 2
8
+ - 0
9
+ version: 0.2.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - sshingler
@@ -14,11 +14,11 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-04-13 00:00:00 +00:00
17
+ date: 2011-05-28 00:00:00 +00:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
- name: hpricot
21
+ name: rake
22
22
  prerelease: false
23
23
  requirement: &id001 !ruby/object:Gem::Requirement
24
24
  none: false
@@ -28,12 +28,12 @@ dependencies:
28
28
  segments:
29
29
  - 0
30
30
  - 8
31
- - 2
32
- version: 0.8.2
31
+ - 7
32
+ version: 0.8.7
33
33
  type: :runtime
34
34
  version_requirements: *id001
35
35
  - !ruby/object:Gem::Dependency
36
- name: json
36
+ name: mechanize
37
37
  prerelease: false
38
38
  requirement: &id002 !ruby/object:Gem::Requirement
39
39
  none: false
@@ -42,24 +42,24 @@ dependencies:
42
42
  - !ruby/object:Gem::Version
43
43
  segments:
44
44
  - 1
45
- - 2
46
- - 4
47
- version: 1.2.4
45
+ - 0
46
+ - 0
47
+ version: 1.0.0
48
48
  type: :runtime
49
49
  version_requirements: *id002
50
50
  - !ruby/object:Gem::Dependency
51
- name: rest-client
51
+ name: nokogiri
52
52
  prerelease: false
53
53
  requirement: &id003 !ruby/object:Gem::Requirement
54
54
  none: false
55
55
  requirements:
56
- - - ">="
56
+ - - ~>
57
57
  - !ruby/object:Gem::Version
58
58
  segments:
59
59
  - 1
60
60
  - 4
61
- - 2
62
- version: 1.4.2
61
+ - 4
62
+ version: 1.4.4
63
63
  type: :runtime
64
64
  version_requirements: *id003
65
65
  - !ruby/object:Gem::Dependency
@@ -73,40 +73,41 @@ dependencies:
73
73
  segments:
74
74
  - 0
75
75
  - 0
76
- - 9
77
- version: 0.0.9
76
+ - 11
77
+ version: 0.0.11
78
78
  type: :runtime
79
79
  version_requirements: *id004
80
- description: Jakal is a Ruby library which contains some utilities for tagging content, cleaning text from web pages and working with RSS feeds.
80
+ description: Jakal is a Ruby library for tagging keywords from web pages.
81
81
  email: "'shingler@gmail.com'"
82
82
  executables: []
83
83
 
84
84
  extensions: []
85
85
 
86
- extra_rdoc_files:
87
- - README.md
88
- - License.txt
86
+ extra_rdoc_files: []
87
+
89
88
  files:
89
+ - .gitignore
90
+ - Gemfile
91
+ - Gemfile.lock
92
+ - License.txt
93
+ - README.md
94
+ - Rakefile
95
+ - jkl.gemspec
90
96
  - lib/jkl.rb
91
97
  - lib/jkl/calais_client.rb
92
- - lib/jkl/rest_client.rb
93
98
  - lib/jkl/rss_client.rb
94
99
  - lib/jkl/text_client.rb
95
100
  - test/fixtures/bbc_story.html
96
101
  - test/fixtures/topix_rss.xml
97
102
  - test/unit/jkl_test.rb
98
103
  - test/unit/text_cleaning_test.rb
99
- - config/config.yml
100
- - README.md
101
- - License.txt
102
104
  has_rdoc: true
103
105
  homepage: http://github.com/sshingler/jkl
104
106
  licenses: []
105
107
 
106
108
  post_install_message:
107
- rdoc_options:
108
- - --inline-source
109
- - --charset=UTF-8
109
+ rdoc_options: []
110
+
110
111
  require_paths:
111
112
  - lib
112
113
  required_ruby_version: !ruby/object:Gem::Requirement
@@ -130,7 +131,10 @@ requirements: []
130
131
  rubyforge_project:
131
132
  rubygems_version: 1.3.7
132
133
  signing_key:
133
- specification_version: 2
134
- summary: Jakal is a Ruby library which contains some utilities for tagging content, cleaning text from web pages and working with RSS feeds.
135
- test_files: []
136
-
134
+ specification_version: 3
135
+ summary: Jakal is a Ruby library for tagging keywords from web pages.
136
+ test_files:
137
+ - test/fixtures/bbc_story.html
138
+ - test/fixtures/topix_rss.xml
139
+ - test/unit/jkl_test.rb
140
+ - test/unit/text_cleaning_test.rb
data/config/config.yml DELETED
@@ -1,4 +0,0 @@
1
- topix:
2
- http://www.topix.net/rss/search/article?q=
3
- twitter:
4
- http://search.twitter.com/trends.json
@@ -1,49 +0,0 @@
1
- require 'net/http'
2
- require 'hpricot'
3
-
4
- module Jkl
5
- class << self
6
-
7
- def post_to(url, post_args = {})
8
- begin
9
- uri = URI.parse(url)
10
- resp, data = Net::HTTP.post_form(uri, post_args)
11
- data
12
- rescue URI::InvalidURIError => e
13
- puts("WARN: JKL Invalid URI: #{e}")
14
- rescue SocketError => e
15
- puts("WARN: JKL Could not connect: #{e}")
16
- rescue Errno::ECONNREFUSED => e
17
- puts("WARN: JKL Connection refused: #{e}")
18
- end
19
- end
20
-
21
- def get_from(uri)
22
- begin
23
- response = Net::HTTP.get_response(URI.parse(uri))
24
- response.body
25
- rescue URI::InvalidURIError => e
26
- puts("WARN: JKL Invalid URI: #{e}")
27
- rescue SocketError => e
28
- puts("WARN: JKL Could not connect: #{e}")
29
- rescue Errno::ECONNREFUSED => e
30
- puts("WARN: JKL Connection refused: #{e}")
31
- end
32
- end
33
-
34
- def get_from_over_https(host, path)
35
- http = Net::HTTP.new(host, "443")
36
- http.use_ssl = true
37
- http.get2(path) # returns [status, data]
38
- end
39
-
40
- def get_xml_from(uri)
41
- Hpricot.XML(get_from(uri))
42
- end
43
-
44
- def document_from(text)
45
- Hpricot(text)
46
- end
47
-
48
- end
49
- end