jakal 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,30 +2,25 @@ PATH
2
2
  remote: .
3
3
  specs:
4
4
  jakal (0.2.0)
5
- calais (>= 0.0.11)
6
5
  mechanize (>= 1.0.0)
7
6
  nokogiri (~> 1.4.4)
8
7
  rake (>= 0.8.7)
8
+ term-extract (~> 0.5.1)
9
9
 
10
10
  GEM
11
11
  remote: http://rubygems.org/
12
12
  specs:
13
13
  addressable (2.2.6)
14
14
  archive-tar-minitar (0.5.2)
15
- calais (0.0.11)
16
- curb (>= 0.1.4)
17
- json (>= 1.1.3)
18
- nokogiri (>= 1.3.3)
19
15
  columnize (0.3.2)
20
16
  crack (0.1.8)
21
- curb (0.7.15)
22
- json (1.5.1)
23
17
  linecache19 (0.5.12)
24
18
  ruby_core_source (>= 0.1.4)
25
19
  mechanize (1.0.0)
26
20
  nokogiri (>= 1.2.1)
27
21
  nokogiri (1.4.4)
28
22
  rake (0.8.7)
23
+ rbtagger (0.4.7)
29
24
  ruby-debug-base19 (0.11.25)
30
25
  columnize (>= 0.3.1)
31
26
  linecache19 (>= 0.5.11)
@@ -37,6 +32,8 @@ GEM
37
32
  ruby_core_source (0.1.5)
38
33
  archive-tar-minitar (>= 0.5.2)
39
34
  shoulda (2.11.3)
35
+ term-extract (0.5.1)
36
+ rbtagger
40
37
  test-unit (2.3.0)
41
38
  webmock (1.6.2)
42
39
  addressable (>= 2.2.2)
@@ -46,12 +43,12 @@ PLATFORMS
46
43
  ruby
47
44
 
48
45
  DEPENDENCIES
49
- calais (>= 0.0.11)
50
46
  jakal!
51
47
  mechanize (>= 1.0.0)
52
48
  nokogiri (~> 1.4.4)
53
49
  rake (>= 0.8.7)
54
50
  ruby-debug19 (= 0.11.6)
55
51
  shoulda (= 2.11.3)
52
+ term-extract (~> 0.5.1)
56
53
  test-unit (= 2.3.0)
57
54
  webmock (= 1.6.2)
data/README.md CHANGED
@@ -1,59 +1,7 @@
1
1
  # jkl
2
2
 
3
- jkl (Jakal) does these things:
4
-
5
- * Connects to URLs.
6
- * Gets stuff out of RSS feeds.
7
- * Gets the main content from web pages
8
- * Gets a set of metadata from a web page (using the calais gem)
9
-
10
- # Sample usage
11
-
12
- For example - if you had a RSS feed:
13
-
14
- require "jkl"
15
-
16
- feed = "http://www.topix.net/rss/search/article?x=0&y=0&q=London"
17
-
18
- You could collect some metadata from the links in that feed, thus:
19
-
20
- tags = []
21
- Jkl::links(feed).each do |link|
22
- tags << Jkl::tags("my_calais_key",link)
23
- end
24
-
25
- A metadata sample might look something like this:
26
-
27
- {
28
- "Person"=>["Barack Obama", "Hillary Clinton"],
29
- "Position"=>["Secretary of State"]
30
- }
3
+ Jakal is a Ruby library for tagging keywords from web pages.
31
4
 
32
5
  It is hosted at [gemcutter](http://gemcutter.org/gems/jakal)
33
6
 
34
- gem install jakal
35
-
36
- # LICENSE:
37
-
38
- (The MIT License)
39
-
40
- Copyright (c) 2009 sshingler
41
-
42
- Permission is hereby granted, free of charge, to any person obtaining
43
- a copy of this software and associated documentation files (the
44
- 'Software'), to deal in the Software without restriction, including
45
- without limitation the rights to use, copy, modify, merge, publish,
46
- distribute, sublicense, and/or sell copies of the Software, and to
47
- permit persons to whom the Software is furnished to do so, subject to
48
- the following conditions:
49
-
50
- The above copyright notice and this permission notice shall be
51
- included in all copies or substantial portions of the Software.
52
-
53
- THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
54
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
55
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
56
- IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
57
- CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
58
- TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
59
- SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
7
+ gem install jakal
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'convert_to_should_syntax' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
10
+
11
+ require 'rubygems'
12
+ require 'bundler/setup'
13
+
14
+ load Gem.bin_path('shoulda', 'convert_to_should_syntax')
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'edit_json.rb' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
10
+
11
+ require 'rubygems'
12
+ require 'bundler/setup'
13
+
14
+ load Gem.bin_path('json', 'edit_json.rb')
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'minitar' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
10
+
11
+ require 'rubygems'
12
+ require 'bundler/setup'
13
+
14
+ load Gem.bin_path('archive-tar-minitar', 'minitar')
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'nokogiri' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
10
+
11
+ require 'rubygems'
12
+ require 'bundler/setup'
13
+
14
+ load Gem.bin_path('nokogiri', 'nokogiri')
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'prettify_json.rb' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
10
+
11
+ require 'rubygems'
12
+ require 'bundler/setup'
13
+
14
+ load Gem.bin_path('json', 'prettify_json.rb')
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'rake' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
10
+
11
+ require 'rubygems'
12
+ require 'bundler/setup'
13
+
14
+ load Gem.bin_path('rake', 'rake')
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'rdebug' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
10
+
11
+ require 'rubygems'
12
+ require 'bundler/setup'
13
+
14
+ load Gem.bin_path('ruby-debug19', 'rdebug')
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'term-extract' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
10
+
11
+ require 'rubygems'
12
+ require 'bundler/setup'
13
+
14
+ load Gem.bin_path('term-extract', 'term-extract')
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'testrb' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
10
+
11
+ require 'rubygems'
12
+ require 'bundler/setup'
13
+
14
+ load Gem.bin_path('test-unit', 'testrb')
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = %q{jakal}
3
- s.version = "0.2.0"
3
+ s.version = "0.2.1"
4
4
  s.platform = Gem::Platform::RUBY
5
5
  s.authors = ['sshingler']
6
6
  s.homepage = %q{http://github.com/sshingler/jkl}
@@ -15,5 +15,5 @@ Gem::Specification.new do |s|
15
15
  s.add_dependency(%q<rake>, [">= 0.8.7"])
16
16
  s.add_dependency(%q<mechanize>, [">= 1.0.0"])
17
17
  s.add_dependency(%q<nokogiri>, ["~> 1.4.4"])
18
- s.add_dependency(%q<calais>, [">= 0.0.11"])
18
+ s.add_dependency(%q<term-extract>, ["~> 0.5.1"])
19
19
  end
data/lib/jkl.rb CHANGED
@@ -1,8 +1,8 @@
1
1
  require_relative "jkl/rss_client"
2
- require_relative "jkl/calais_client"
3
2
  require_relative "jkl/text_client"
4
3
 
5
4
  require "mechanize"
5
+ require "term-extract"
6
6
 
7
7
  module Jkl
8
8
  class << self
@@ -18,5 +18,9 @@ module Jkl
18
18
  yield link if block_given?
19
19
  end
20
20
  end
21
+
22
+ def tags(text)
23
+ TermExtract.extract(text.force_encoding("UTF-8")).keys
24
+ end
21
25
  end
22
26
  end
@@ -3,8 +3,12 @@ module Jkl
3
3
  class << self
4
4
 
5
5
  def plain_text(document, words_on_line = 5)
6
- remove_short_lines(strip_all_tags(remove_script_tags(document)), words_on_line)
6
+ CGI::unescapeHTML(
7
+ remove_short_lines(
8
+ strip_all_tags(
9
+ remove_script_tags(document)), words_on_line))
7
10
  end
11
+ alias :sanitize :plain_text
8
12
 
9
13
  def strip_all_tags(text)
10
14
  text.gsub(/<\/?[^>]*>/, "")
@@ -26,13 +30,11 @@ module Jkl
26
30
  def remove_short_lines(text, words_on_line = 5)
27
31
  text = text.gsub(/\s\s/, "\n")
28
32
  str = ""
29
- # remove short lines - ususally just navigation
30
- text.split("\n").each do |l|
31
- str << l unless l.count(" ") < words_on_line
33
+ text.split("\n").each do |line|
34
+ str << "#{line}\n" unless line.count(" ") < words_on_line
32
35
  end
33
36
  str
34
37
  end
35
-
36
38
  end
37
39
  end
38
40
  end
@@ -2,13 +2,12 @@ require "test/unit"
2
2
  require "shoulda"
3
3
  require "webmock/test_unit"
4
4
  require "yaml"
5
-
6
5
  require_relative "../../lib/jkl"
7
6
 
8
7
  class JklTest < Test::Unit::TestCase
9
8
  include WebMock::API
10
9
 
11
- context "for documents, plain text and tags" do
10
+ context "Jkl: When handling documents, plain text and tags" do
12
11
  setup do
13
12
  @url = "http://www.bbc.co.uk"
14
13
  response = File.read('test/fixtures/bbc_story.html')
@@ -18,27 +17,27 @@ class JklTest < Test::Unit::TestCase
18
17
  :headers => {'Content-Type' => 'text/html'})
19
18
  end
20
19
 
21
- should "Get a document from a URL" do
20
+ should "get a document from a URL" do
22
21
  doc = Jkl::get(@url)
23
22
  assert_not_nil doc
24
23
  end
25
24
 
26
- should "Get the plain text version of a document" do
25
+ should "get the plain text version of a document" do
27
26
  document = Jkl::get(@url)
28
27
  text = Jkl::Text::plain_text(document,2)
29
- assert_equal 8884, text.length
28
+ assert_equal 9009, text.length
30
29
  end
31
30
 
32
- should "Get the keywords from a document" do
31
+ should "get the keywords from a document" do
33
32
  document = Jkl::get(@url)
34
33
  text = Jkl::Text::plain_text(document,2)
35
- tags = Jkl::Extraction::tags(calais_key, text)
34
+ tags = Jkl::tags(text)
36
35
  assert ! tags.empty?
37
36
  end
38
37
  end
39
38
 
40
- context "for RSS" do
41
- should "Get links from a feed" do
39
+ context "Jkl: When handling RSS" do
40
+ should "get links from a feed" do
42
41
  feed = "http://feeds.bbci.co.uk/news/rss.xml"
43
42
  response = File.read('test/fixtures/topix_rss.xml')
44
43
  stub_request(:get, "http://feeds.bbci.co.uk/news/rss.xml").
@@ -13,7 +13,7 @@ the cat sat on the mat
13
13
  a short line
14
14
  HTML
15
15
  result = Jkl::Text::remove_short_lines input
16
- assert result == "the cat sat on the mat"
16
+ assert result == "the cat sat on the mat\n"
17
17
  end
18
18
 
19
19
  should "Remove shorter lines" do
@@ -23,7 +23,8 @@ the cat sat on the slightly fluffy, yet worn and homely mat
23
23
  a short line
24
24
  HTML
25
25
  result = Jkl::Text::remove_short_lines(input, 8)
26
- assert result == "the cat sat on the slightly fluffy, yet worn and homely mat"
26
+ expected = "the cat sat on the slightly fluffy, yet worn and homely mat\n"
27
+ assert_equal expected, result
27
28
  end
28
29
 
29
30
  should "Remove script tags" do
@@ -35,7 +36,7 @@ function nofunction(){var bob;}
35
36
  a short line
36
37
  HTML
37
38
  result = Jkl::Text::remove_short_lines input
38
- assert result == "the cat sat on the mat"
39
+ assert result == "the cat sat on the mat\n"
39
40
  end
40
41
 
41
42
  should "Remove html comments" do
@@ -45,7 +46,7 @@ the cat sat on the mat
45
46
  a short line
46
47
  HTML
47
48
  result = Jkl::Text::remove_short_lines input
48
- assert result == "the cat sat on the mat"
49
+ assert result == "the cat sat on the mat\n"
49
50
  end
50
51
 
51
52
  should "Remove blank lines" do
@@ -55,7 +56,7 @@ the cat sat on the mat
55
56
  a short line
56
57
  HTML
57
58
  result = Jkl::Text::remove_short_lines input
58
- assert result == "the cat sat on the mat"
59
+ assert result == "the cat sat on the mat\n"
59
60
  end
60
61
 
61
62
  should "Strip all tags" do
@@ -63,7 +64,7 @@ HTML
63
64
  <p>the cat sat on the mat</p>
64
65
  HTML
65
66
  result = Jkl::Text::strip_all_tags input
66
- assert result == "the cat sat on the mat\n" #TODO fix carriage return
67
+ assert result == "the cat sat on the mat\n"
67
68
  end
68
69
 
69
70
  should "Clean text" do
@@ -76,7 +77,19 @@ the cat sat on the mat
76
77
  some end stuff here
77
78
  HTML
78
79
  result = Jkl::Text::plain_text(input)
79
- assert result == "the cat sat on the mat"
80
+ assert result == "the cat sat on the mat\n"
81
+ end
82
+
83
+ should "Remove HTML escaped characters" do
84
+ input = <<HTML
85
+ Testing, testing, one two three.
86
+ <p><strong>The cat didn&#39;t sit on the mat</strong></p>
87
+ HTML
88
+ expected = <<EXPECTED
89
+ Testing, testing, one two three.
90
+ The cat didn't sit on the mat
91
+ EXPECTED
92
+ assert_equal expected, Jkl::Text::plain_text(input, 2)
80
93
  end
81
94
  end
82
95
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 2
8
- - 0
9
- version: 0.2.0
8
+ - 1
9
+ version: 0.2.1
10
10
  platform: ruby
11
11
  authors:
12
12
  - sshingler
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-05-28 00:00:00 +00:00
17
+ date: 2011-06-02 00:00:00 +00:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -63,18 +63,18 @@ dependencies:
63
63
  type: :runtime
64
64
  version_requirements: *id003
65
65
  - !ruby/object:Gem::Dependency
66
- name: calais
66
+ name: term-extract
67
67
  prerelease: false
68
68
  requirement: &id004 !ruby/object:Gem::Requirement
69
69
  none: false
70
70
  requirements:
71
- - - ">="
71
+ - - ~>
72
72
  - !ruby/object:Gem::Version
73
73
  segments:
74
74
  - 0
75
- - 0
76
- - 11
77
- version: 0.0.11
75
+ - 5
76
+ - 1
77
+ version: 0.5.1
78
78
  type: :runtime
79
79
  version_requirements: *id004
80
80
  description: Jakal is a Ruby library for tagging keywords from web pages.
@@ -92,9 +92,17 @@ files:
92
92
  - License.txt
93
93
  - README.md
94
94
  - Rakefile
95
+ - bin/convert_to_should_syntax
96
+ - bin/edit_json.rb
97
+ - bin/minitar
98
+ - bin/nokogiri
99
+ - bin/prettify_json.rb
100
+ - bin/rake
101
+ - bin/rdebug
102
+ - bin/term-extract
103
+ - bin/testrb
95
104
  - jkl.gemspec
96
105
  - lib/jkl.rb
97
- - lib/jkl/calais_client.rb
98
106
  - lib/jkl/rss_client.rb
99
107
  - lib/jkl/text_client.rb
100
108
  - test/fixtures/bbc_story.html
@@ -1,28 +0,0 @@
1
- require "calais"
2
-
3
- module Jkl
4
- module Extraction
5
- class << self
6
-
7
- def calais_response(key, text)
8
- Calais.process_document(
9
- :content => text,
10
- :license_id => key
11
- )
12
- end
13
-
14
- def entities(key,text)
15
- calais_response(key, text).entities.map{|e| {e.type => [e.attributes["name"]]}}
16
- end
17
-
18
- def tags(key, text)
19
- nested_list = {}
20
- entities(key,text).each do |a|
21
- nested_list = nested_list.merge!(a){ |key,v1,v2| v1+v2 }
22
- end
23
- nested_list
24
- end
25
-
26
- end
27
- end
28
- end