jakal 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -2,30 +2,25 @@ PATH
2
2
  remote: .
3
3
  specs:
4
4
  jakal (0.2.0)
5
- calais (>= 0.0.11)
6
5
  mechanize (>= 1.0.0)
7
6
  nokogiri (~> 1.4.4)
8
7
  rake (>= 0.8.7)
8
+ term-extract (~> 0.5.1)
9
9
 
10
10
  GEM
11
11
  remote: http://rubygems.org/
12
12
  specs:
13
13
  addressable (2.2.6)
14
14
  archive-tar-minitar (0.5.2)
15
- calais (0.0.11)
16
- curb (>= 0.1.4)
17
- json (>= 1.1.3)
18
- nokogiri (>= 1.3.3)
19
15
  columnize (0.3.2)
20
16
  crack (0.1.8)
21
- curb (0.7.15)
22
- json (1.5.1)
23
17
  linecache19 (0.5.12)
24
18
  ruby_core_source (>= 0.1.4)
25
19
  mechanize (1.0.0)
26
20
  nokogiri (>= 1.2.1)
27
21
  nokogiri (1.4.4)
28
22
  rake (0.8.7)
23
+ rbtagger (0.4.7)
29
24
  ruby-debug-base19 (0.11.25)
30
25
  columnize (>= 0.3.1)
31
26
  linecache19 (>= 0.5.11)
@@ -37,6 +32,8 @@ GEM
37
32
  ruby_core_source (0.1.5)
38
33
  archive-tar-minitar (>= 0.5.2)
39
34
  shoulda (2.11.3)
35
+ term-extract (0.5.1)
36
+ rbtagger
40
37
  test-unit (2.3.0)
41
38
  webmock (1.6.2)
42
39
  addressable (>= 2.2.2)
@@ -46,12 +43,12 @@ PLATFORMS
46
43
  ruby
47
44
 
48
45
  DEPENDENCIES
49
- calais (>= 0.0.11)
50
46
  jakal!
51
47
  mechanize (>= 1.0.0)
52
48
  nokogiri (~> 1.4.4)
53
49
  rake (>= 0.8.7)
54
50
  ruby-debug19 (= 0.11.6)
55
51
  shoulda (= 2.11.3)
52
+ term-extract (~> 0.5.1)
56
53
  test-unit (= 2.3.0)
57
54
  webmock (= 1.6.2)
data/README.md CHANGED
@@ -1,59 +1,7 @@
1
1
  # jkl
2
2
 
3
- jkl (Jakal) does these things:
4
-
5
- * Connects to URLs.
6
- * Gets stuff out of RSS feeds.
7
- * Gets the main content from web pages
8
- * Gets a set of metadata from a web page (using the calais gem)
9
-
10
- # Sample usage
11
-
12
- For example - if you had a RSS feed:
13
-
14
- require "jkl"
15
-
16
- feed = "http://www.topix.net/rss/search/article?x=0&y=0&q=London"
17
-
18
- You could collect some metadata from the links in that feed, thus:
19
-
20
- tags = []
21
- Jkl::links(feed).each do |link|
22
- tags << Jkl::tags("my_calais_key",link)
23
- end
24
-
25
- A metadata sample might look something like this:
26
-
27
- {
28
- "Person"=>["Barack Obama", "Hillary Clinton"],
29
- "Position"=>["Secretary of State"]
30
- }
3
+ Jakal is a Ruby library for tagging keywords from web pages.
31
4
 
32
5
  It is hosted at [gemcutter](http://gemcutter.org/gems/jakal)
33
6
 
34
- gem install jakal
35
-
36
- # LICENSE:
37
-
38
- (The MIT License)
39
-
40
- Copyright (c) 2009 sshingler
41
-
42
- Permission is hereby granted, free of charge, to any person obtaining
43
- a copy of this software and associated documentation files (the
44
- 'Software'), to deal in the Software without restriction, including
45
- without limitation the rights to use, copy, modify, merge, publish,
46
- distribute, sublicense, and/or sell copies of the Software, and to
47
- permit persons to whom the Software is furnished to do so, subject to
48
- the following conditions:
49
-
50
- The above copyright notice and this permission notice shall be
51
- included in all copies or substantial portions of the Software.
52
-
53
- THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
54
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
55
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
56
- IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
57
- CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
58
- TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
59
- SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
7
+ gem install jakal
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'convert_to_should_syntax' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
10
+
11
+ require 'rubygems'
12
+ require 'bundler/setup'
13
+
14
+ load Gem.bin_path('shoulda', 'convert_to_should_syntax')
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'edit_json.rb' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
10
+
11
+ require 'rubygems'
12
+ require 'bundler/setup'
13
+
14
+ load Gem.bin_path('json', 'edit_json.rb')
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'minitar' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
10
+
11
+ require 'rubygems'
12
+ require 'bundler/setup'
13
+
14
+ load Gem.bin_path('archive-tar-minitar', 'minitar')
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'nokogiri' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
10
+
11
+ require 'rubygems'
12
+ require 'bundler/setup'
13
+
14
+ load Gem.bin_path('nokogiri', 'nokogiri')
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'prettify_json.rb' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
10
+
11
+ require 'rubygems'
12
+ require 'bundler/setup'
13
+
14
+ load Gem.bin_path('json', 'prettify_json.rb')
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'rake' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
10
+
11
+ require 'rubygems'
12
+ require 'bundler/setup'
13
+
14
+ load Gem.bin_path('rake', 'rake')
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'rdebug' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
10
+
11
+ require 'rubygems'
12
+ require 'bundler/setup'
13
+
14
+ load Gem.bin_path('ruby-debug19', 'rdebug')
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'term-extract' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
10
+
11
+ require 'rubygems'
12
+ require 'bundler/setup'
13
+
14
+ load Gem.bin_path('term-extract', 'term-extract')
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file was generated by Bundler.
4
+ #
5
+ # The application 'testrb' is installed as part of a gem, and
6
+ # this file is here to facilitate running it.
7
+ #
8
+
9
+ ENV['BUNDLE_GEMFILE'] ||= File.expand_path("../../Gemfile", __FILE__)
10
+
11
+ require 'rubygems'
12
+ require 'bundler/setup'
13
+
14
+ load Gem.bin_path('test-unit', 'testrb')
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = %q{jakal}
3
- s.version = "0.2.0"
3
+ s.version = "0.2.1"
4
4
  s.platform = Gem::Platform::RUBY
5
5
  s.authors = ['sshingler']
6
6
  s.homepage = %q{http://github.com/sshingler/jkl}
@@ -15,5 +15,5 @@ Gem::Specification.new do |s|
15
15
  s.add_dependency(%q<rake>, [">= 0.8.7"])
16
16
  s.add_dependency(%q<mechanize>, [">= 1.0.0"])
17
17
  s.add_dependency(%q<nokogiri>, ["~> 1.4.4"])
18
- s.add_dependency(%q<calais>, [">= 0.0.11"])
18
+ s.add_dependency(%q<term-extract>, ["~> 0.5.1"])
19
19
  end
data/lib/jkl.rb CHANGED
@@ -1,8 +1,8 @@
1
1
  require_relative "jkl/rss_client"
2
- require_relative "jkl/calais_client"
3
2
  require_relative "jkl/text_client"
4
3
 
5
4
  require "mechanize"
5
+ require "term-extract"
6
6
 
7
7
  module Jkl
8
8
  class << self
@@ -18,5 +18,9 @@ module Jkl
18
18
  yield link if block_given?
19
19
  end
20
20
  end
21
+
22
+ def tags(text)
23
+ TermExtract.extract(text.force_encoding("UTF-8")).keys
24
+ end
21
25
  end
22
26
  end
@@ -3,8 +3,12 @@ module Jkl
3
3
  class << self
4
4
 
5
5
  def plain_text(document, words_on_line = 5)
6
- remove_short_lines(strip_all_tags(remove_script_tags(document)), words_on_line)
6
+ CGI::unescapeHTML(
7
+ remove_short_lines(
8
+ strip_all_tags(
9
+ remove_script_tags(document)), words_on_line))
7
10
  end
11
+ alias :sanitize :plain_text
8
12
 
9
13
  def strip_all_tags(text)
10
14
  text.gsub(/<\/?[^>]*>/, "")
@@ -26,13 +30,11 @@ module Jkl
26
30
  def remove_short_lines(text, words_on_line = 5)
27
31
  text = text.gsub(/\s\s/, "\n")
28
32
  str = ""
29
- # remove short lines - usually just navigation
30
- text.split("\n").each do |l|
31
- str << l unless l.count(" ") < words_on_line
33
+ text.split("\n").each do |line|
34
+ str << "#{line}\n" unless line.count(" ") < words_on_line
32
35
  end
33
36
  str
34
37
  end
35
-
36
38
  end
37
39
  end
38
40
  end
@@ -2,13 +2,12 @@ require "test/unit"
2
2
  require "shoulda"
3
3
  require "webmock/test_unit"
4
4
  require "yaml"
5
-
6
5
  require_relative "../../lib/jkl"
7
6
 
8
7
  class JklTest < Test::Unit::TestCase
9
8
  include WebMock::API
10
9
 
11
- context "for documents, plain text and tags" do
10
+ context "Jkl: When handling documents, plain text and tags" do
12
11
  setup do
13
12
  @url = "http://www.bbc.co.uk"
14
13
  response = File.read('test/fixtures/bbc_story.html')
@@ -18,27 +17,27 @@ class JklTest < Test::Unit::TestCase
18
17
  :headers => {'Content-Type' => 'text/html'})
19
18
  end
20
19
 
21
- should "Get a document from a URL" do
20
+ should "get a document from a URL" do
22
21
  doc = Jkl::get(@url)
23
22
  assert_not_nil doc
24
23
  end
25
24
 
26
- should "Get the plain text version of a document" do
25
+ should "get the plain text version of a document" do
27
26
  document = Jkl::get(@url)
28
27
  text = Jkl::Text::plain_text(document,2)
29
- assert_equal 8884, text.length
28
+ assert_equal 9009, text.length
30
29
  end
31
30
 
32
- should "Get the keywords from a document" do
31
+ should "get the keywords from a document" do
33
32
  document = Jkl::get(@url)
34
33
  text = Jkl::Text::plain_text(document,2)
35
- tags = Jkl::Extraction::tags(calais_key, text)
34
+ tags = Jkl::tags(text)
36
35
  assert ! tags.empty?
37
36
  end
38
37
  end
39
38
 
40
- context "for RSS" do
41
- should "Get links from a feed" do
39
+ context "Jkl: When handling RSS" do
40
+ should "get links from a feed" do
42
41
  feed = "http://feeds.bbci.co.uk/news/rss.xml"
43
42
  response = File.read('test/fixtures/topix_rss.xml')
44
43
  stub_request(:get, "http://feeds.bbci.co.uk/news/rss.xml").
@@ -13,7 +13,7 @@ the cat sat on the mat
13
13
  a short line
14
14
  HTML
15
15
  result = Jkl::Text::remove_short_lines input
16
- assert result == "the cat sat on the mat"
16
+ assert result == "the cat sat on the mat\n"
17
17
  end
18
18
 
19
19
  should "Remove shorter lines" do
@@ -23,7 +23,8 @@ the cat sat on the slightly fluffy, yet worn and homely mat
23
23
  a short line
24
24
  HTML
25
25
  result = Jkl::Text::remove_short_lines(input, 8)
26
- assert result == "the cat sat on the slightly fluffy, yet worn and homely mat"
26
+ expected = "the cat sat on the slightly fluffy, yet worn and homely mat\n"
27
+ assert_equal expected, result
27
28
  end
28
29
 
29
30
  should "Remove script tags" do
@@ -35,7 +36,7 @@ function nofunction(){var bob;}
35
36
  a short line
36
37
  HTML
37
38
  result = Jkl::Text::remove_short_lines input
38
- assert result == "the cat sat on the mat"
39
+ assert result == "the cat sat on the mat\n"
39
40
  end
40
41
 
41
42
  should "Remove html comments" do
@@ -45,7 +46,7 @@ the cat sat on the mat
45
46
  a short line
46
47
  HTML
47
48
  result = Jkl::Text::remove_short_lines input
48
- assert result == "the cat sat on the mat"
49
+ assert result == "the cat sat on the mat\n"
49
50
  end
50
51
 
51
52
  should "Remove blank lines" do
@@ -55,7 +56,7 @@ the cat sat on the mat
55
56
  a short line
56
57
  HTML
57
58
  result = Jkl::Text::remove_short_lines input
58
- assert result == "the cat sat on the mat"
59
+ assert result == "the cat sat on the mat\n"
59
60
  end
60
61
 
61
62
  should "Strip all tags" do
@@ -63,7 +64,7 @@ HTML
63
64
  <p>the cat sat on the mat</p>
64
65
  HTML
65
66
  result = Jkl::Text::strip_all_tags input
66
- assert result == "the cat sat on the mat\n" #TODO fix carriage return
67
+ assert result == "the cat sat on the mat\n"
67
68
  end
68
69
 
69
70
  should "Clean text" do
@@ -76,7 +77,19 @@ the cat sat on the mat
76
77
  some end stuff here
77
78
  HTML
78
79
  result = Jkl::Text::plain_text(input)
79
- assert result == "the cat sat on the mat"
80
+ assert result == "the cat sat on the mat\n"
81
+ end
82
+
83
+ should "Remove HTML escaped characters" do
84
+ input = <<HTML
85
+ Testing, testing, one two three.
86
+ <p><strong>The cat didn&#39;t sit on the mat</strong></p>
87
+ HTML
88
+ expected = <<EXPECTED
89
+ Testing, testing, one two three.
90
+ The cat didn't sit on the mat
91
+ EXPECTED
92
+ assert_equal expected, Jkl::Text::plain_text(input, 2)
80
93
  end
81
94
  end
82
95
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 2
8
- - 0
9
- version: 0.2.0
8
+ - 1
9
+ version: 0.2.1
10
10
  platform: ruby
11
11
  authors:
12
12
  - sshingler
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-05-28 00:00:00 +00:00
17
+ date: 2011-06-02 00:00:00 +00:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -63,18 +63,18 @@ dependencies:
63
63
  type: :runtime
64
64
  version_requirements: *id003
65
65
  - !ruby/object:Gem::Dependency
66
- name: calais
66
+ name: term-extract
67
67
  prerelease: false
68
68
  requirement: &id004 !ruby/object:Gem::Requirement
69
69
  none: false
70
70
  requirements:
71
- - - ">="
71
+ - - ~>
72
72
  - !ruby/object:Gem::Version
73
73
  segments:
74
74
  - 0
75
- - 0
76
- - 11
77
- version: 0.0.11
75
+ - 5
76
+ - 1
77
+ version: 0.5.1
78
78
  type: :runtime
79
79
  version_requirements: *id004
80
80
  description: Jakal is a Ruby library for tagging keywords from web pages.
@@ -92,9 +92,17 @@ files:
92
92
  - License.txt
93
93
  - README.md
94
94
  - Rakefile
95
+ - bin/convert_to_should_syntax
96
+ - bin/edit_json.rb
97
+ - bin/minitar
98
+ - bin/nokogiri
99
+ - bin/prettify_json.rb
100
+ - bin/rake
101
+ - bin/rdebug
102
+ - bin/term-extract
103
+ - bin/testrb
95
104
  - jkl.gemspec
96
105
  - lib/jkl.rb
97
- - lib/jkl/calais_client.rb
98
106
  - lib/jkl/rss_client.rb
99
107
  - lib/jkl/text_client.rb
100
108
  - test/fixtures/bbc_story.html
@@ -1,28 +0,0 @@
1
- require "calais"
2
-
3
- module Jkl
4
- module Extraction
5
- class << self
6
-
7
- def calais_response(key, text)
8
- Calais.process_document(
9
- :content => text,
10
- :license_id => key
11
- )
12
- end
13
-
14
- def entities(key,text)
15
- calais_response(key, text).entities.map{|e| {e.type => [e.attributes["name"]]}}
16
- end
17
-
18
- def tags(key, text)
19
- nested_list = {}
20
- entities(key,text).each do |a|
21
- nested_list = nested_list.merge!(a){ |key,v1,v2| v1+v2 }
22
- end
23
- nested_list
24
- end
25
-
26
- end
27
- end
28
- end