webtagger 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.rdoc +49 -0
- data/Rakefile +55 -0
- data/VERSION +1 -0
- data/bin/webtagger +60 -0
- data/lib/httparty_icebox.rb +263 -0
- data/lib/webtagger.rb +133 -0
- data/test/helper.rb +10 -0
- data/test/test_webtagger.rb +7 -0
- metadata +109 -0
data/.document
ADDED
data/.gitignore
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 lfborjas
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
= webtagger
|
2
|
+
|
3
|
+
Webtagger is a simple Ruby gem that uses web services to extract the important terms from a text, suitable for tagging it, finding its main subject or automatically building queries.
|
4
|
+
|
5
|
+
It depends on {httparty}[http://github.com/jnunemaker/httparty] and uses the following external APIs:
|
6
|
+
* {Yahoo term extraction}[http://developer.yahoo.com/search/content/V1/termExtraction.html]
|
7
|
+
* {Tag-the-net}[http://tagthe.net]
|
8
|
+
* {Alchemy API}[http://www.alchemyapi.com/api/keyword/textc.html]
|
9
|
+
|
10
|
+
And it's written to support any API in the future.
|
11
|
+
|
12
|
+
==Installation
|
13
|
+
gem install webtagger
|
14
|
+
|
15
|
+
==Usage
|
16
|
+
|
17
|
+
Ok, little caveat here, you might need an API-key for some of the services, so you might want to run
|
18
|
+
webtagger --configure
|
19
|
+
|
20
|
+
to be prompted for the API key of each service that needs one; the keys are saved in <tt>~/.webtagger</tt>. Or, you can pass them in the tagging method, like this
|
21
|
+
tags = WebTagger.tag(text, service="yahoo", token="YOUR-API-KEY")
|
22
|
+
|
23
|
+
Besides that pickle, the standard usage is really simple:
|
24
|
+
require 'webtagger'
|
25
|
+
text = "Hi, I'm text"
|
26
|
+
#you can use the default service (tagthe)
|
27
|
+
tags = WebTagger.tag(text)
|
28
|
+
#or choose whichever you want, if it isn't supported, falls back to the default, so you don't have
|
29
|
+
#to be on the lookout for exceptions
|
30
|
+
tags = WebTagger.tag(text,"yahoo")
|
31
|
+
|
32
|
+
If something goes wrong when calling an API, a `WebTagger::WebTaggerError` will be raised; the exception instance has a `response` attribute holding the original error response.
|
33
|
+
|
34
|
+
If a http error happens (404, 500, etc), `nil` will be returned.
|
35
|
+
|
36
|
+
|
37
|
+
== Note on Patches/Pull Requests
|
38
|
+
|
39
|
+
* Fork the project.
|
40
|
+
* Make your feature addition or bug fix.
|
41
|
+
* Add tests for it. This is important so I don't break it in a
|
42
|
+
future version unintentionally.
|
43
|
+
* Commit, do not mess with rakefile, version, or history.
|
44
|
+
(if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
|
45
|
+
* Send me a pull request. Bonus points for topic branches.
|
46
|
+
|
47
|
+
== Copyright
|
48
|
+
|
49
|
+
Copyright (c) 2010 lfborjas. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'rubygems'
require 'rake'

# Gem packaging and release tasks. Jeweler is optional: without it only the
# test/rcov/rdoc tasks below are available.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "webtagger"
    gem.summary = %Q{Use some popular web services to extract keywords from text}
    gem.description = %Q{Use webtagger to have easy access to keyword extraction web services (tagthe.net, yahoo and alchemy)}
    gem.email = "me@lfborjas.com"
    gem.homepage = "http://github.com/lfborjas/webtagger"
    gem.authors = ["lfborjas"]
    gem.add_development_dependency "thoughtbot-shoulda", ">= 0"
    gem.add_dependency "httparty", "0.6.1"
    # Assign instead of appending: appending produced a duplicated
    # "webtagger" entry in the generated gemspec (presumably because jeweler
    # already picks up bin/ on its own -- confirm against the jeweler version in use).
    gem.executables = ['webtagger']
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage task; replaced by an explanatory stub when rcov is not installed.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is only defined when jeweler was loaded above; do not
# hang a missing prerequisite on :test, which would make `rake test` fail.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

# Documentation task. Newer rake versions no longer ship rake/rdoctask, so
# prefer the task class bundled with rdoc and fall back to the legacy one.
begin
  begin
    require 'rdoc/task'
    rdoc_task_class = RDoc::Task
  rescue LoadError
    require 'rake/rdoctask'
    rdoc_task_class = Rake::RDocTask
  end

  rdoc_task_class.new do |rdoc|
    # strip: the VERSION file ends with a newline
    version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

    rdoc.rdoc_dir = 'rdoc'
    rdoc.title = "webtagger #{version}"
    rdoc.rdoc_files.include('README*')
    rdoc.rdoc_files.include('lib/**/*.rb')
  end
rescue LoadError
  task :rdoc do
    abort "RDoc is not available. In order to build the docs, you must: gem install rdoc"
  end
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
data/bin/webtagger
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'fileutils'
|
4
|
+
$:.unshift File.dirname(__FILE__) + "/../lib"
|
5
|
+
|
6
|
+
require 'webtagger'
|
7
|
+
|
8
|
+
service = ""
|
9
|
+
|
10
|
+
# Interactively asks for the API token of every service that needs one
# (all of WebTagger::SERVICES except tagthe) and stores the answers in
# ~/.webtagger, one SERVICE=token line per service.
#
# A blank answer keeps the token already stored for that service, if any.
# The file is rewritten after every prompt so that answers given before an
# interruption are not lost.
def configure
  conf = File.join(ENV['HOME'], '.webtagger')
  FileUtils.touch(conf) unless File.exist?(conf)

  # Load what is already stored. File.foreach closes the file when done.
  srvcs = {}
  File.foreach(conf) do |service_conf|
    # limit 2: keep tokens that themselves contain a "=" in one piece
    s, t = service_conf.split(/\s*=\s*/, 2)
    next if s.nil? || s.strip.empty? # blank line
    srvcs[s.strip.downcase] = t ? t.strip : ""
  end

  WebTagger::SERVICES.each do |service|
    next if service == "tagthe"

    puts "Token for #{service.downcase} (leave blank if you don't want to set it now or you already did): "
    # Read from $stdin explicitly: a bare Kernel#gets reads from the files
    # named in ARGV whenever ARGV is not empty. nil (EOF) counts as blank.
    token = $stdin.gets.to_s.strip

    if token.empty?
      srvcs[service] ||= ""
    else
      srvcs[service] = token
    end

    File.open(conf, 'w') do |new_conf|
      srvcs.each do |s, t|
        new_conf.write("#{s.upcase}=#{t.strip}\n")
      end
    end
  end
end
|
30
|
+
|
31
|
+
# Command line options. --configure, --token and --help do their job and
# exit immediately; --service only records which service the tagging step
# below should use.
OptionParser.new do |opt|
  opt.banner = "usage: webtagger [OPTIONS] [text]"
  opt.on('-c', '--configure', "Add tokens for each service") do
    configure()
    exit
  end

  opt.on('-t', '--token=[service]', String, "Get the token of a specific service (or all if not specified)") do |s|
    s = "all" if s.nil? || s.empty?
    puts WebTagger.get_token(s)
    exit
  end
  opt.on('-s', '--service=[service]', String, "Tag the text with the specified service (defaults to tagthe)") do |s|
    # an unsupported (or missing) service name falls back to the default one
    s = "" unless WebTagger::SERVICES.include?(s)
    service = s
  end
  opt.on('-h', '--help', "Display the help screen and exit") do
    puts opt
    exit
  end

end.parse!

#do the actual tagging:
text = ARGV[0]
if text && !text.empty?
  begin
    # Array(): a nil result prints as an empty list instead of a mangled "nil"
    puts "tags: %s" % Array(WebTagger.tag(text, service)).inspect[1..-2]
  rescue StandardError
    puts "Couldn't extract tags"
    exit 1 # let calling scripts notice the failure
  end
else
  puts "You must supply some text to tag!"
end
|
@@ -0,0 +1,263 @@
|
|
1
|
+
# = Icebox : Caching for HTTParty
|
2
|
+
#
|
3
|
+
# Cache responses in HTTParty models [http://github.com/jnunemaker/httparty]
|
4
|
+
#
|
5
|
+
# === Usage
|
6
|
+
#
|
7
|
+
# class Foo
|
8
|
+
# include HTTParty
|
9
|
+
# include HTTParty::Icebox
|
10
|
+
# cache :store => 'file', :timeout => 600, :location => MY_APP_ROOT.join('tmp', 'cache')
|
11
|
+
# end
|
12
|
+
#
|
13
|
+
# Modeled after Martyn Loughran's APICache [http://github.com/newbamboo/api_cache]
|
14
|
+
# and Ruby On Rails's caching [http://api.rubyonrails.org/classes/ActiveSupport/Cache.html]
|
15
|
+
#
|
16
|
+
# Author: Karel Minarik [www.karmi.cz]
|
17
|
+
#
|
18
|
+
# === Notes
|
19
|
+
#
|
20
|
+
# Thanks to Amit Chakradeo for pointing out response objects have to be stored marshalled on FS
|
21
|
+
# Thanks to Marlin Forbes for pointing out the query parameters have to be included in the cache key
|
22
|
+
#
|
23
|
+
#
|
24
|
+
|
25
|
+
require 'logger'
begin
  require 'ftools' # only ships with Ruby 1.8
rescue LoadError
  # ftools was removed from the standard library in Ruby 1.9; nothing below needs it
end
require 'fileutils'
require 'tmpdir'
require 'pathname'
require 'digest/md5'
|
30
|
+
|
31
|
+
module HTTParty #:nodoc:
  # == Caching for HTTParty
  # See documentation in HTTParty::Icebox::ClassMethods.cache
  #
  module Icebox

    module ClassMethods

      # Enable caching and set cache options
      # Returns memoized cache object
      #
      # Following options are available, default values are in []:
      #
      # +store+::       Storage mechanism for cached data (memory, filesystem, your own) [memory]
      # +timeout+::     Cache expiration in seconds [60]
      # +logger+::      Path to logfile or logger instance [nil, silent]
      #
      # Any additional options are passed to the Cache constructor
      #
      # Usage:
      #
      #   # Enable caching in HTTParty, in memory, for 1 minute
      #   cache # Use default values
      #
      #   # Enable caching in HTTParty, on filesystem (/tmp), for 10 minutes
      #   cache :store => 'file', :timeout => 600, :location => '/tmp/'
      #
      #   # Use your own cache store (see +AbstractStore+ class below)
      #   cache :store => 'memcached', :timeout => 600, :server => '192.168.1.1:1001'
      #
      # Only the first call builds the cache; later calls return the same
      # object and ignore their options.
      #
      def cache(options={})
        options = options.dup # do not modify the hash owned by the caller
        options[:store]   ||= 'memory'
        options[:timeout] ||= 60
        @cache ||= Cache.new( options.delete(:store), options )
      end

    end

    # When included, extend class with +cache+ method
    # and redefine +get+ method to use cache
    #
    def self.included(receiver) #:nodoc:
      receiver.extend ClassMethods
      receiver.class_eval do

        # Get response from network
        #
        # TODO: Why alias :new :old is not working here? Returns NoMethodError
        #
        def self.get_without_caching(path, options={})
          perform_request Net::HTTP::Get, path, options
        end

        # Get response from cache, if available
        #
        # The cache key is the path followed by the string form of the
        # :query option. Only responses with code 200 are stored.
        #
        def self.get_with_caching(path, options={})
          # Build the key on a copy: appending to +path+ itself would change
          # the path that is subsequently requested.
          key = path.to_s.dup
          key << options[:query].to_s if options[:query]
          if cache.exists?(key) && !cache.stale?(key)
            Cache.logger.debug "CACHE -- GET #{path}#{options[:query]}"
            return cache.get(key)
          else
            Cache.logger.debug "/!\\ NETWORK -- GET #{path}#{options[:query]}"
            response = get_without_caching(path, options)
            cache.set(key, response) if response.code == 200
            return response
          end
        end

        # Redefine original HTTParty +get+ method to use cache
        #
        def self.get(path, options={})
          # pass the caller's options on (they used to be replaced by {})
          self.get_with_caching(path, options)
        end

      end
    end

    # === Cache container
    #
    # Pass a store name ('memory', etc) to new
    #
    class Cache
      attr_accessor :store

      # +store+::   name of the store ('memory', 'file', ...), see +lookup_store+
      # +options+:: passed to the store; +:logger+ also (re)sets the logger
      #             shared by every Cache instance
      def initialize(store, options={})
        self.class.logger = options[:logger]
        @store = self.class.lookup_store(store).new(options)
      end

      # Keys are MD5-encoded before they reach the store.
      def get(key);            @store.get encode(key) unless stale?(key); end
      def set(key, value);     @store.set encode(key), value;             end
      def exists?(key);        @store.exists? encode(key);                end
      def stale?(key);         @store.stale?  encode(key);                end

      def self.logger; @logger || default_logger; end
      def self.default_logger; ::Logger.new(STDERR); end

      # Pass a filename (String), IO object, Logger instance or +nil+ to silence the logger
      def self.logger=(device); @logger = device.kind_of?(::Logger) ? device : ::Logger.new(device); end

      # Return store class based on passed name ('memory' => Store::MemoryStore).
      # Raises Store::StoreNotFound when no such class is loaded.
      def self.lookup_store(name)
        store_name = "#{name.to_s.capitalize}Store"
        return Store::const_get(store_name)
      rescue NameError
        raise Store::StoreNotFound, "The cache store '#{store_name}' was not found. Did you loaded any such class?"
      end

      private

      def encode(key); Digest::MD5.hexdigest(key); end
    end


    # === Cache stores
    #
    module Store

      class StoreNotFound < StandardError; end #:nodoc:

      # ==== Abstract Store
      # Inherit your store from this class
      # *IMPORTANT*: Do not forget to call +super+ in your +initialize+ method!
      #
      class AbstractStore
        # Raises ArgumentError unless +:timeout+ (seconds) is given.
        def initialize(options={})
          raise ArgumentError, "You need to set the :timeout parameter" unless options[:timeout]
          @timeout = options[:timeout]
          message =  "Cache: Using #{self.class.to_s.split('::').last}"
          message << " in location: #{options[:location]}" if options[:location]
          message << " with timeout #{options[:timeout]} sec"
          Cache.logger.info message unless options[:logger].nil?
        end
        %w{set get exists? stale?}.each do |method_name|
          # Accept any arguments so that callers get the explanatory
          # NoMethodError below rather than an ArgumentError about arity.
          define_method(method_name) { |*_args| raise NoMethodError, "Please implement method #{method_name} in your store class" }
        end
      end

      # ==== Store objects in memory
      # See HTTParty::Icebox::ClassMethods.cache
      #
      class MemoryStore < AbstractStore
        def initialize(options={})
          super; @store = {}; self
        end
        def set(key, value)
          Cache.logger.info("Cache: set (#{key})")
          @store[key] = [Time.now, value]; true
        end
        # Returns the stored value, or nil when nothing is stored under +key+.
        def get(key)
          entry = @store[key]
          data = entry && entry[1]
          Cache.logger.info("Cache: #{data.nil? ? "miss" : "hit"} (#{key})")
          data
        end
        def exists?(key)
          !@store[key].nil?
        end
        # A missing entry counts as stale.
        def stale?(key)
          return true unless exists?(key)
          Time.now - created(key) > @timeout
        end
        private
        def created(key)
          @store[key][0]
        end
      end

      # ==== Store objects on the filesystem
      # See HTTParty::Icebox::ClassMethods.cache
      #
      # Values are stored marshalled, one file per key, in +:location+
      # (the system temporary directory by default). The age of an entry is
      # the modification time of its file.
      #
      class FileStore < AbstractStore
        def initialize(options={})
          super
          options[:location] ||= Dir::tmpdir
          @path = Pathname.new( options[:location] )
          FileUtils.mkdir_p( @path )
          self
        end
        def set(key, value)
          Cache.logger.info("Cache: set (#{key})")
          # binary mode: Marshal output is not text
          File.open( @path.join(key), 'wb' ) { |file| file << Marshal.dump(value) }
          true
        end
        # Returns the stored value, or nil when there is no file for +key+.
        def get(key)
          # NOTE(review): Marshal.load must only see trusted data; anyone who
          # can write to the cache directory controls what gets loaded here.
          data = exists?(key) ? File.open( @path.join(key), 'rb' ) { |file| Marshal.load(file.read) } : nil
          Cache.logger.info("Cache: #{data.nil? ? "miss" : "hit"} (#{key})")
          data
        end
        def exists?(key)
          File.exist?( @path.join(key) )
        end
        # A missing entry counts as stale.
        def stale?(key)
          return true unless exists?(key)
          Time.now - created(key) > @timeout
        end
        private
        def created(key)
          File.mtime( @path.join(key) )
        end
      end
    end

  end
end
|
238
|
+
|
239
|
+
|
240
|
+
# Major parts of this code are based on architecture of ApiCache.
|
241
|
+
# Copyright (c) 2008 Martyn Loughran
|
242
|
+
#
|
243
|
+
# Other parts are inspired by the ActiveSupport::Cache in Ruby On Rails.
|
244
|
+
# Copyright (c) 2005-2009 David Heinemeier Hansson
|
245
|
+
#
|
246
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
247
|
+
# a copy of this software and associated documentation files (the
|
248
|
+
# "Software"), to deal in the Software without restriction, including
|
249
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
250
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
251
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
252
|
+
# the following conditions:
|
253
|
+
#
|
254
|
+
# The above copyright notice and this permission notice shall be
|
255
|
+
# included in all copies or substantial portions of the Software.
|
256
|
+
#
|
257
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
258
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
259
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
260
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
261
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
262
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
263
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/lib/webtagger.rb
ADDED
@@ -0,0 +1,133 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
require 'httparty'
|
3
|
+
require 'httparty_icebox'
|
4
|
+
|
5
|
+
#Module for extracting keywords from text. Uses the tagthe, yahoo and alchemyAPI web services.
|
6
|
+
#Because the yahoo and alchemy services require an API key, a command line utility is provided
|
7
|
+
#to add those tokens for subsequent uses of the modules, storing them in <tt>~/.webtagger</tt>
|
8
|
+
#it uses caching to avoid being throttled by the apis, via the httparty_icebox gem
|
9
|
+
module WebTagger
|
10
|
+
|
11
|
+
#The services supported by this version
|
12
|
+
SERVICES = ['yahoo', 'alchemy', 'tagthe']
|
13
|
+
|
14
|
+
#A generic exception to handle api call errors.
#The response that caused the error is available through +response+.
class WebTaggerError < RuntimeError
  attr_reader :response

  #Params:
  #+resp+:: the original response returned by the service
  #+msg+(optional):: the error message, so that an error built without an
  #explicit message still says something more useful than its class name
  def initialize(resp, msg="Error in API call")
    super(msg)
    @response = resp
  end
end
|
21
|
+
|
22
|
+
#Get the persisted token for a service from <tt>~/.webtagger</tt>.
#Pass <tt>"all"</tt> to get every stored token in a hash keyed by the
#downcased service name.
#Returns +nil+ when the file does not exist, when the service is not one of
#SERVICES or when no (non-blank) token is stored for it.
#Params:
#+service+:: the service for which the token should be retrieved, must be one of SERVICES (or "all")
def get_token(service="")
  service = service.to_s.strip.downcase
  conf = File.join(ENV['HOME'], '.webtagger')
  return nil unless File.exist? conf

  srvcs = {}
  # File.foreach closes the file once it has been read
  File.foreach(conf) do |service_conf|
    # limit 2 keeps tokens containing "=" whole and yields an empty token
    # (instead of nil) for "SERVICE=" lines
    s, t = service_conf.split(/\s*=\s*/, 2)
    next if t.nil? || s.strip.empty? # not a SERVICE=token line
    srvcs[s.strip.downcase] = t.strip
  end

  if service == "all"
    srvcs
  elsif SERVICES.include?(service)
    token = srvcs[service]
    # a blank token means "not configured yet"
    (token.nil? || token.empty?) ? nil : token
  end
end
|
44
|
+
|
45
|
+
#Class to access the
#{yahoo term extraction web service}[http://developer.yahoo.com/search/content/V1/termExtraction.html]
class Yahoo
  include HTTParty
  include HTTParty::Icebox
  format :json
  base_uri "http://search.yahooapis.com/ContentAnalysisService/V1"
  #NOTE(review): Icebox only redefines +get+, and this class issues +post+
  #requests, so this setting presumably has no effect -- confirm
  cache :store => 'memory', :timeout => 1

  #Extract the terms of a text.
  #Params:
  #+text+:: the text to analyze
  #+token+:: the yahoo application id
  #Returns the extracted terms (+[]+ when the result set has none).
  #Raises a RuntimeError when the token is missing or blank and a
  #WebTaggerError when the response carries no result set.
  def self.tag(text, token)
    raise "Token missing!" if !token || token.to_s.strip.empty?
    resp = post("/termExtraction", :query => {:appid => token, :context => text, :output=>'json'} )
    if resp.has_key?('ResultSet')
      resp['ResultSet']['Result'] || []
    else
      raise WebTaggerError.new(resp), "Error in API call"
    end
  end
end
|
64
|
+
|
65
|
+
#Class for accessing the
#{alchemy keyword extraction service}[http://www.alchemyapi.com/api/keyword/textc.html]
class Alchemy
  include HTTParty
  include HTTParty::Icebox
  format :json
  base_uri "http://access.alchemyapi.com/calls/text"
  #NOTE(review): Icebox only redefines +get+, and this class issues +post+
  #requests, so this setting presumably has no effect -- confirm
  cache :store => 'memory', :timeout => 1

  #Extract the ranked keywords of a text.
  #Params:
  #+text+:: the text to analyze
  #+token+:: the alchemy api key
  #Returns the text of every keyword in the response (+[]+ when there are none).
  #Raises a RuntimeError when the token is missing or blank and a
  #WebTaggerError when the response status is 'ERROR'.
  def self.tag(text, token)
    raise "Token missing!" if !token || token.to_s.strip.empty?
    resp = post("/TextGetRankedKeywords", :query => {:apikey => token, :text => text, :outputMode=>'json'} )
    if resp['status'] != 'ERROR'
      #it's a hash array of [{:text=>"", :relevance=>""}]
      #(a response without 'keywords' yields an empty list)
      (resp['keywords'] || []).map { |m| m["text"] }
    else
      raise WebTaggerError.new(resp), "Error in API call"
    end
  end
end
|
89
|
+
|
90
|
+
#Class for accessing the
#{tagthe API}[http://tagthe.net/fordevelopers]
class Tagthe
  include HTTParty
  include HTTParty::Icebox
  format :json
  base_uri "http://tagthe.net/api"
  #NOTE(review): Icebox only redefines +get+, and this class issues +post+
  #requests, so this setting presumably has no effect -- confirm
  cache :store => 'memory', :timeout => 1

  #Extract the topics of a text.
  #Params:
  #+text+:: the text to analyze
  #Returns the 'topic' dimension of the first meme in the response, or +[]+
  #when the response has no memes, dimensions or topics.
  def self.tag(text)
    resp = post("/", :query => {:text => text, :view=>'json'} )
    return [] unless resp.has_key?('memes')

    #walk down step by step: an empty 'memes' list must not blow up
    meme = (resp['memes'] || []).first
    dimensions = meme && meme['dimensions']
    topics = dimensions && dimensions['topic']
    topics || []
  end
end
|
110
|
+
|
111
|
+
#Method for obtaining keywords in a text
#Params:
#+text+:: a +String+, the text to tag
#+service+(optional):: a +String+, the name of the service to use, defaults to tagthe and must be one of SERVICES;
#any other value (including +nil+) falls back to tagthe
#+token+(optional):: a token to use for calling the service (tagthe doesn't need one), keep in mind that this value
#supersedes the one stored in +~/.webtagger+
def tag(text,service="tagthe",token=nil)
  service = service.to_s.strip.downcase
  case service
  when "yahoo", "alchemy"
    #only these services need a token; look it up only when none was given
    token ||= get_token(service)
    service == "yahoo" ? Yahoo.tag(text, token) : Alchemy.tag(text, token)
  else
    Tagthe.tag(text)
  end
end
|
130
|
+
|
131
|
+
module_function :tag
|
132
|
+
module_function :get_token
|
133
|
+
end #of webtagger module
|
data/test/helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,109 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: webtagger
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- lfborjas
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-08-28 00:00:00 -06:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: thoughtbot-shoulda
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
version: "0"
|
33
|
+
type: :development
|
34
|
+
version_requirements: *id001
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: httparty
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - "="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
hash: 5
|
44
|
+
segments:
|
45
|
+
- 0
|
46
|
+
- 6
|
47
|
+
- 1
|
48
|
+
version: 0.6.1
|
49
|
+
type: :runtime
|
50
|
+
version_requirements: *id002
|
51
|
+
description: Use webtagger to hace easy access to keyword extraction web services (tagthe.net, yahoo and alchemy)
|
52
|
+
email: me@lfborjas.com
|
53
|
+
executables:
|
54
|
+
- webtagger
|
55
|
+
- webtagger
|
56
|
+
extensions: []
|
57
|
+
|
58
|
+
extra_rdoc_files:
|
59
|
+
- LICENSE
|
60
|
+
- README.rdoc
|
61
|
+
files:
|
62
|
+
- .document
|
63
|
+
- .gitignore
|
64
|
+
- LICENSE
|
65
|
+
- README.rdoc
|
66
|
+
- Rakefile
|
67
|
+
- VERSION
|
68
|
+
- bin/webtagger
|
69
|
+
- lib/httparty_icebox.rb
|
70
|
+
- lib/webtagger.rb
|
71
|
+
- test/helper.rb
|
72
|
+
- test/test_webtagger.rb
|
73
|
+
has_rdoc: true
|
74
|
+
homepage: http://github.com/lfborjas/webtagger
|
75
|
+
licenses: []
|
76
|
+
|
77
|
+
post_install_message:
|
78
|
+
rdoc_options:
|
79
|
+
- --charset=UTF-8
|
80
|
+
require_paths:
|
81
|
+
- lib
|
82
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
83
|
+
none: false
|
84
|
+
requirements:
|
85
|
+
- - ">="
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
hash: 3
|
88
|
+
segments:
|
89
|
+
- 0
|
90
|
+
version: "0"
|
91
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
92
|
+
none: false
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
hash: 3
|
97
|
+
segments:
|
98
|
+
- 0
|
99
|
+
version: "0"
|
100
|
+
requirements: []
|
101
|
+
|
102
|
+
rubyforge_project:
|
103
|
+
rubygems_version: 1.3.7
|
104
|
+
signing_key:
|
105
|
+
specification_version: 3
|
106
|
+
summary: Use some popular web services to extract keywords from text
|
107
|
+
test_files:
|
108
|
+
- test/helper.rb
|
109
|
+
- test/test_webtagger.rb
|