webtagger 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.rdoc +49 -0
- data/Rakefile +55 -0
- data/VERSION +1 -0
- data/bin/webtagger +60 -0
- data/lib/httparty_icebox.rb +263 -0
- data/lib/webtagger.rb +133 -0
- data/test/helper.rb +10 -0
- data/test/test_webtagger.rb +7 -0
- metadata +109 -0
data/.document
ADDED
data/.gitignore
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 lfborjas
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
= webtagger
|
2
|
+
|
3
|
+
Webtagger is a simple ruby gem that uses web intelligence services to extract important terms from texts, suitable for tagging them, finding the main subject or automatically building queries.
|
4
|
+
|
5
|
+
It depends on {httparty}[http://github.com/jnunemaker/httparty] and uses the following external APIs:
|
6
|
+
* {Yahoo term extraction}[http://developer.yahoo.com/search/content/V1/termExtraction.html]
|
7
|
+
* {Tag-the-net}[http://tagthe.net]
|
8
|
+
* {Alchemy API}[http://www.alchemyapi.com/api/keyword/textc.html]
|
9
|
+
|
10
|
+
And it's written to support any API in the future.
|
11
|
+
|
12
|
+
==Installation
|
13
|
+
gem install webtagger
|
14
|
+
|
15
|
+
==Usage
|
16
|
+
|
17
|
+
Ok, little caveat here, you might need an API-key for some of the services, so you might want to run
|
18
|
+
webtagger --configure
|
19
|
+
|
20
|
+
To get instructions on how to get and save the API keys. Or, you can pass them in the tagging method, like this
|
21
|
+
tags = WebTagger.tag(text, service="yahoo", token="YOUR-API-KEY")
|
22
|
+
|
23
|
+
Besides that pickle, the standard usage is really simple:
|
24
|
+
require 'webtagger'
|
25
|
+
text = "Hi, I'm text"
|
26
|
+
#you can use the default service (tagthe)
|
27
|
+
tags = WebTagger.tag(text)
|
28
|
+
#or choose whichever you want, if it isn't supported, falls back to the default, so you don't have
|
29
|
+
#to be on the lookout for exceptions
|
30
|
+
tags = WebTagger.tag(text,"yahoo")
|
31
|
+
|
32
|
+
If something funny happens when calling an API, a `WebTaggerError` will be raised, and the exception instance will have a `response` attribute holding the original error response.
|
33
|
+
|
34
|
+
If an HTTP error happens (404, 500, etc.), `nil` will be returned.
|
35
|
+
|
36
|
+
|
37
|
+
== Note on Patches/Pull Requests
|
38
|
+
|
39
|
+
* Fork the project.
|
40
|
+
* Make your feature addition or bug fix.
|
41
|
+
* Add tests for it. This is important so I don't break it in a
|
42
|
+
future version unintentionally.
|
43
|
+
* Commit, do not mess with rakefile, version, or history.
|
44
|
+
(if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
|
45
|
+
* Send me a pull request. Bonus points for topic branches.
|
46
|
+
|
47
|
+
== Copyright
|
48
|
+
|
49
|
+
Copyright (c) 2010 lfborjas. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
# Build, test, coverage and documentation tasks for the webtagger gem.
require 'rubygems'
require 'rake'

# Gem packaging through Jeweler. When Jeweler cannot be loaded the packaging
# tasks are simply not defined and a hint is printed instead.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "webtagger"
    gem.summary = %Q{Use some popular web services to extract keywords from text}
    gem.description = %Q{Use webtagger to have easy access to keyword extraction web services (tagthe.net, yahoo and alchemy)}
    gem.email = "me@lfborjas.com"
    gem.homepage = "http://github.com/lfborjas/webtagger"
    gem.authors = ["lfborjas"]
    gem.add_development_dependency "thoughtbot-shoulda", ">= 0"
    gem.add_dependency "httparty", "0.6.1"
    # NOTE(review): the released gem metadata lists the `webtagger` executable
    # twice; presumably Jeweler already picks up bin/ on its own - confirm
    # whether this explicit append is still needed.
    gem.executables << 'webtagger'
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Unit tests: every test/**/test_*.rb file, with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage: use RCov when installed, otherwise define an :rcov task that
# aborts with installation instructions.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is expected to be provided by the Jeweler tasks above.
task :test => :check_dependencies

task :default => :test

# API documentation. Newer Rake releases no longer ship rake/rdoctask, so the
# task class provided by RDoc itself is preferred and the legacy location is
# only used as a fallback on old installations.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end
rdoc_task_class.new do |rdoc|
  version = File.exist?('VERSION') ? File.read('VERSION') : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "webtagger #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
data/bin/webtagger
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'fileutils'
|
4
|
+
$:.unshift File.dirname(__FILE__) + "/../lib"
|
5
|
+
|
6
|
+
require 'webtagger'
|
7
|
+
|
8
|
+
service = ""
|
9
|
+
|
10
|
+
# Interactively asks for the API token of every service that needs one
# (all of WebTagger::SERVICES except "tagthe") and persists the answers in
# ~/.webtagger, one upper-cased SERVICE=token line per service.
#
# A blank answer keeps whatever token is already stored for that service.
# The file is rewritten after every answer, so tokens entered so far are kept
# even when the session is interrupted.
def configure
  conf = File.join(ENV['HOME'], '.webtagger')
  FileUtils.touch(conf) unless File.exist? conf

  # Load what is already stored. File.foreach closes the file when done.
  srvcs = {}
  File.foreach(conf) do |service_conf|
    # Split on the first '=' only, so tokens containing '=' stay intact.
    s, t = service_conf.split(/\s*=\s*/, 2)
    # Skip blank lines instead of persisting them as "=" entries.
    next if s.nil? or s.strip.empty?
    srvcs[s.strip.downcase] = t ? t.strip : ""
  end

  WebTagger::SERVICES.each do |service|
    next if service == "tagthe"
    puts "Token for #{service.downcase} (leave blank if you don't want to set it now or you already did): "
    # Read from $stdin explicitly: a bare `gets` reads from the files named in
    # ARGV whenever command line arguments are left over.
    token = $stdin.gets
    srvcs[service] = token.strip if token and not token.strip.empty?
    srvcs[service] ||= ""
    File.open(conf, 'w') do |new_conf|
      srvcs.each do |s, t|
        new_conf.write("#{s.upcase}=#{t.strip}\n")
      end
    end
  end
end
|
30
|
+
|
31
|
+
# Command line definition. --configure, --token and --help act immediately and
# exit; --service only records which service the tagging step should use.
parser = OptionParser.new do |opt|
  opt.banner = "usage: webtagger [OPTIONS] [text]"

  opt.on('-c', '--configure', String, "Add tokens for each service") do
    configure
    exit
  end

  opt.on('-t', '--token=[service]', String, "Get the token of a specific service (or all if not specified)") do |s|
    s = "all" if s.nil? or s.empty?
    puts WebTagger.get_token(s)
    exit
  end

  opt.on('-s', '--service=[service]', String, "Tag the text with the specified service (defaults to tagthe)") do |s|
    # Unknown services collapse to "", which WebTagger.tag treats as the default.
    service = WebTagger::SERVICES.include?(s) ? s : ""
  end

  opt.on('-h', '--help', "Display the help screen and exit") do
    puts opt
    exit
  end
end
parser.parse!

#do the actual tagging: the first remaining argument is the text
text = ARGV[0]
if text.nil? or text.empty?
  puts "You must supply some text to tag!"
else
  begin
    # inspect[1..-2] drops the surrounding brackets of the array representation
    puts "tags: %s" % WebTagger.tag(text, service).inspect[1..-2]
  rescue
    puts "Couldn't extract tags"
  end
end
|
@@ -0,0 +1,263 @@
|
|
1
|
+
# = Icebox : Caching for HTTParty
|
2
|
+
#
|
3
|
+
# Cache responses in HTTParty models [http://github.com/jnunemaker/httparty]
|
4
|
+
#
|
5
|
+
# === Usage
|
6
|
+
#
|
7
|
+
# class Foo
|
8
|
+
# include HTTParty
|
9
|
+
# include HTTParty::Icebox
|
10
|
+
# cache :store => 'file', :timeout => 600, :location => MY_APP_ROOT.join('tmp', 'cache')
|
11
|
+
# end
|
12
|
+
#
|
13
|
+
# Modeled after Martyn Loughran's APICache [http://github.com/newbamboo/api_cache]
|
14
|
+
# and Ruby On Rails's caching [http://api.rubyonrails.org/classes/ActiveSupport/Cache.html]
|
15
|
+
#
|
16
|
+
# Author: Karel Minarik [www.karmi.cz]
|
17
|
+
#
|
18
|
+
# === Notes
|
19
|
+
#
|
20
|
+
# Thanks to Amit Chakradeo for pointing out response objects have to be stored marshalled on FS
|
21
|
+
# Thanks to Marlin Forbes for pointing out the query parameters have to be included in the cache key
|
22
|
+
#
|
23
|
+
#
|
24
|
+
|
25
|
+
require 'logger'
# NOTE(review): 'ftools' only ships with Ruby 1.8 - confirm it is still needed.
require 'ftools'
require 'fileutils'
require 'tmpdir'
require 'pathname'
require 'digest/md5'
|
30
|
+
|
31
|
+
module HTTParty #:nodoc:
|
32
|
+
# == Caching for HTTParty
|
33
|
+
# See documentation in HTTParty::Icebox::ClassMethods.cache
|
34
|
+
#
|
35
|
+
module Icebox
|
36
|
+
|
37
|
+
module ClassMethods
|
38
|
+
|
39
|
+
# Enable caching and set cache options
# Returns memoized cache object
#
# Following options are available, default values are in []:
#
# +store+::       Storage mechanism for cached data (memory, filesystem, your own) [memory]
# +timeout+::     Cache expiration in seconds [60]
# +logger+::      Path to logfile or logger instance [nil, silent]
#
# Any additional options are passed to the Cache constructor
#
# The cache object is created on the first call only; options given on later
# calls are ignored and the memoized object is returned as is.
# The +options+ hash passed in is not modified.
#
# Usage:
#
#   # Enable caching in HTTParty, in memory, for 1 minute
#   cache # Use default values
#
#   # Enable caching in HTTParty, on filesystem (/tmp), for 10 minutes
#   cache :store => 'file', :timeout => 600, :location => '/tmp/'
#
#   # Use your own cache store (see +AbstractStore+ class below)
#   cache :store => 'memcached', :timeout => 600, :server => '192.168.1.1:1001'
#
def cache(options={})
  # Work on a copy: filling in defaults and deleting :store below would
  # otherwise alter the hash owned by the caller.
  settings = options.dup
  settings[:store]   ||= 'memory'
  settings[:timeout] ||= 60
  @cache ||= Cache.new( settings.delete(:store), settings )
end
|
67
|
+
|
68
|
+
end
|
69
|
+
|
70
|
+
# When included, extend class with +cache+ method
# and redefine +get+ method to use cache
#
# Three class methods are defined on the receiver:
# +get_without_caching+:: always performs the GET request
# +get_with_caching+::    serves a fresh cached response when there is one,
#                         otherwise performs the request and caches it when
#                         the response code is 200
# +get+::                 delegates to +get_with_caching+
#
def self.included(receiver) #:nodoc:
  receiver.extend ClassMethods
  receiver.class_eval do

    # Get response from network
    #
    # TODO: Why alias :new :old is not working here? Returns NoMethodError
    #
    def self.get_without_caching(path, options={})
      perform_request Net::HTTP::Get, path, options
    end

    # Get response from cache, if available
    #
    def self.get_with_caching(path, options={})
      # The key is the path plus the query parameters. It is built as a new
      # string: appending to +path+ itself would modify the caller's string
      # and the path that is sent to the network below.
      key = "#{path}#{options[:query]}"
      if cache.exists?(key) and not cache.stale?(key)
        Cache.logger.debug "CACHE -- GET #{path}#{options[:query]}"
        return cache.get(key)
      else
        Cache.logger.debug "/!\\ NETWORK -- GET #{path}#{options[:query]}"
        response = get_without_caching(path, options)
        cache.set(key, response) if response.code == 200
        return response
      end
    end

    # Redefine original HTTParty +get+ method to use cache
    #
    def self.get(path, options={})
      # Forward the options as given (query, headers, ...).
      get_with_caching(path, options)
    end

  end
end
|
109
|
+
|
110
|
+
# === Cache container
#
# Pass a store name ('memory', etc) to new
#
# Wraps a store object and MD5-encodes every key before handing it to the
# store. The logger is kept at class level and is replaced each time a Cache
# is instantiated.
#
class Cache
  attr_accessor :store

  # +store+::   name of the store; 'memory' resolves to Store::MemoryStore, etc.
  # +options+:: passed to the store constructor; +:logger+ also sets the class logger
  def initialize(store, options={})
    self.class.logger = options[:logger]
    @store = self.class.lookup_store(store).new(options)
  end

  # Returns the cached value, or +nil+ when the entry is stale
  def get(key);            @store.get encode(key) unless stale?(key); end
  def set(key, value);     @store.set encode(key), value;             end
  def exists?(key);        @store.exists? encode(key);                end
  def stale?(key);         @store.stale?  encode(key);                end

  def self.logger; @logger || default_logger; end
  # Memoized so repeated calls do not build a new Logger every time
  def self.default_logger; @default_logger ||= ::Logger.new(STDERR); end

  # Pass a filename (String), IO object, Logger instance or +nil+ to silence the logger
  def self.logger=(device); @logger = device.kind_of?(::Logger) ? device : ::Logger.new(device); end

  # Return store class based on passed name
  # Raises Store::StoreNotFound when no such class can be resolved.
  # (Stays public: +initialize+ calls it through +self.class+.)
  def self.lookup_store(name)
    store_name = "#{name.capitalize}Store"
    return Store::const_get(store_name)
  rescue NameError
    raise Store::StoreNotFound, "The cache store '#{store_name}' was not found. Did you load any such class?"
  end

  private

  def encode(key); Digest::MD5.hexdigest(key); end
end
|
145
|
+
|
146
|
+
|
147
|
+
# === Cache stores
#
# Every store answers +set+, +get+, +exists?+ and +stale?+ for already
# encoded keys and expires entries after the +:timeout+ option (seconds).
#
module Store

  class StoreNotFound < StandardError; end #:nodoc:

  # ==== Abstract Store
  # Inherit your store from this class
  # *IMPORTANT*: Do not forget to call +super+ in your +initialize+ method!
  #
  class AbstractStore
    # Raises ArgumentError when +:timeout+ is missing.
    def initialize(options={})
      raise ArgumentError, "You need to set the :timeout parameter" unless options[:timeout]
      @timeout = options[:timeout]
      message =  "Cache: Using #{self.class.to_s.split('::').last}"
      message << " in location: #{options[:location]}" if options[:location]
      message << " with timeout #{options[:timeout]} sec"
      Cache.logger.info message unless options[:logger].nil?
    end
    %w{set get exists? stale?}.each do |method_name|
      # Accept any arguments so that the call reaches the explanatory error
      # below instead of failing on the argument count.
      define_method(method_name) { |*_args| raise NoMethodError, "Please implement method #{method_name} in your store class" }
    end
  end

  # ==== Store objects in memory
  # See HTTParty::Icebox::ClassMethods.cache
  #
  class MemoryStore < AbstractStore
    def initialize(options={})
      super
      @store = {}
    end
    # Entries are kept as [creation time, value] pairs
    def set(key, value)
      Cache.logger.info("Cache: set (#{key})")
      @store[key] = [Time.now, value]; true
    end
    # Returns +nil+ (a miss) for unknown keys
    def get(key)
      entry = @store[key]
      data  = entry && entry[1]
      Cache.logger.info("Cache: #{data.nil? ? "miss" : "hit"} (#{key})")
      data
    end
    def exists?(key)
      !@store[key].nil?
    end
    def stale?(key)
      return true unless exists?(key)
      Time.now - created(key) > @timeout
    end
    private
    def created(key)
      @store[key][0]
    end
  end

  # ==== Store objects on the filesystem
  # See HTTParty::Icebox::ClassMethods.cache
  #
  # One marshalled file per key inside +:location+ (system tmpdir by default);
  # the file modification time is used as the creation time of the entry.
  #
  class FileStore < AbstractStore
    def initialize(options={})
      super
      @path = Pathname.new( options[:location] || Dir::tmpdir )
      FileUtils.mkdir_p( @path )
    end
    def set(key, value)
      Cache.logger.info("Cache: set (#{key})")
      # Marshal output is binary, so the file is written in binary mode
      File.open( @path.join(key), 'wb' ) { |file| file << Marshal.dump(value) }
      true
    end
    # Returns +nil+ (a miss) when there is no file for the key
    def get(key)
      # NOTE(review): Marshal.load trusts the file content; the default
      # location is the shared tmpdir - confirm this is acceptable.
      data = exists?(key) ? File.open( @path.join(key), 'rb' ) { |file| Marshal.load(file.read) } : nil
      Cache.logger.info("Cache: #{data.nil? ? "miss" : "hit"} (#{key})")
      data
    end
    def exists?(key)
      File.exist?( @path.join(key) )
    end
    def stale?(key)
      return true unless exists?(key)
      Time.now - created(key) > @timeout
    end
    private
    def created(key)
      File.mtime( @path.join(key) )
    end
  end
end
|
235
|
+
|
236
|
+
end
|
237
|
+
end
|
238
|
+
|
239
|
+
|
240
|
+
# Major parts of this code are based on architecture of ApiCache.
|
241
|
+
# Copyright (c) 2008 Martyn Loughran
|
242
|
+
#
|
243
|
+
# Other parts are inspired by the ActiveSupport::Cache in Ruby On Rails.
|
244
|
+
# Copyright (c) 2005-2009 David Heinemeier Hansson
|
245
|
+
#
|
246
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
247
|
+
# a copy of this software and associated documentation files (the
|
248
|
+
# "Software"), to deal in the Software without restriction, including
|
249
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
250
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
251
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
252
|
+
# the following conditions:
|
253
|
+
#
|
254
|
+
# The above copyright notice and this permission notice shall be
|
255
|
+
# included in all copies or substantial portions of the Software.
|
256
|
+
#
|
257
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
258
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
259
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
260
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
261
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
262
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
263
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/lib/webtagger.rb
ADDED
@@ -0,0 +1,133 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
require 'httparty'
|
3
|
+
require 'httparty_icebox'
|
4
|
+
|
5
|
+
#Module for extracting keywords from text. Uses the tagthe, yahoo and alchemyAPI web services.
|
6
|
+
#Because the yahoo and alchemy services require an API key, a command line utility is provided
|
7
|
+
#to add those tokens for subsequent uses of the modules, storing them in <tt>~/.webtagger</tt>
|
8
|
+
#it uses caching to avoid being throttled by the apis, via the httparty_icebox gem
|
9
|
+
module WebTagger
|
10
|
+
|
11
|
+
#The services supported by this version
|
12
|
+
SERVICES = ['yahoo', 'alchemy', 'tagthe']
|
13
|
+
|
14
|
+
#Generic exception signalling a failed api call; it keeps the response
#that the service sent back so callers can inspect it
class WebTaggerError < RuntimeError
  attr_reader :response

  #+resp+:: the response object returned by the failing api call
  def initialize(resp)
    @response = resp
  end
end
|
21
|
+
|
22
|
+
#Get the persisted token for a service; when the service is "all", every stored token is returned in a hash
#Params:
#+service+:: the service for which the token should be retrieved, must be one of SERVICES (or "all")
#Returns +nil+ when <tt>~/.webtagger</tt> does not exist, when the service is not one of SERVICES
#or when no (non-empty) token is stored for it. The hash returned for "all" maps
#lower-cased service names to their tokens and may contain empty tokens.
def get_token(service="")
  service = service.strip.downcase
  conf = File.join(ENV['HOME'], '.webtagger')
  return nil unless File.exist? conf
  srvcs = {}
  #File.foreach closes the file once every line has been read
  File.foreach(conf) do |service_conf|
    #split on the first '=' only, so tokens containing '=' are kept whole
    s, t = service_conf.split(/\s*=\s*/, 2)
    next if s.nil? or s.strip.empty?
    #a line without a value (p.e. "YAHOO=") counts as an empty token
    srvcs[s.strip.downcase] = t ? t.strip : ""
  end

  return srvcs if service == "all"
  return nil unless SERVICES.include?(service)
  token = srvcs[service]
  (token.nil? or token.empty?) ? nil : token
end
|
44
|
+
|
45
|
+
#Class to access the
#{yahoo term extraction web service}[http://developer.yahoo.com/search/content/V1/termExtraction.html]
class Yahoo
  include HTTParty
  include HTTParty::Icebox
  format :json
  base_uri "http://search.yahooapis.com/ContentAnalysisService/V1"
  cache :store => 'memory', :timeout => 1

  #Posts +text+ to the termExtraction endpoint, using +token+ as the appid
  #Returns the 'Result' entry of the 'ResultSet' in the response, or an empty array when it is missing
  #Raises a RuntimeError when no token is given and a WebTaggerError (holding the response)
  #when the response has no 'ResultSet'
  def self.tag(text, token)
    raise "Token missing!" unless token
    params = {:appid => token, :context => text, :output=>'json'}
    resp = post("/termExtraction", :query => params )
    raise WebTaggerError.new(resp), "Error in API call" unless resp.has_key?('ResultSet')
    resp['ResultSet']['Result'] || []
  end
end
|
64
|
+
|
65
|
+
#Class for accessing the
#{alchemy keyword extraction service}[http://www.alchemyapi.com/api/keyword/textc.html]
class Alchemy
  include HTTParty
  include HTTParty::Icebox
  format :json
  base_uri "http://access.alchemyapi.com/calls/text"
  cache :store => 'memory', :timeout => 1

  #Posts +text+ to the TextGetRankedKeywords endpoint, using +token+ as the apikey
  #Returns an array with the "text" of every entry in the 'keywords' list of the response
  #Raises a RuntimeError when no token is given and a WebTaggerError (holding the response)
  #when the response status is 'ERROR'
  def self.tag(text, token)
    raise "Token missing!" unless token
    params = {:apikey => token, :text => text, :outputMode=>'json'}
    resp = post("/TextGetRankedKeywords", :query => params )
    raise WebTaggerError.new(resp), "Error in API call" if resp['status'] == 'ERROR'
    #it's a hash array of [{:text=>"", :relevance=>""}], only the text is kept
    resp['keywords'].map { |keyword| keyword["text"] }
  end
end
|
89
|
+
|
90
|
+
#Class for accessing the
#{tagthe API}[http://tagthe.net/fordevelopers]
class Tagthe
  include HTTParty
  include HTTParty::Icebox
  format :json
  base_uri "http://tagthe.net/api"
  cache :store => 'memory', :timeout => 1

  #Posts +text+ to the service and returns the 'topic' dimension of the first meme in the response
  #Returns an empty array when the response has no memes, no dimensions or no topics
  def self.tag(text)
    resp = post("/", :query => {:text => text, :view=>'json'} )
    #walk the nesting one level at a time, so an empty 'memes' list or a
    #missing level yields [] instead of a NoMethodError on nil
    meme       = (resp['memes'] || []).first
    dimensions = meme && meme['dimensions']
    (dimensions && dimensions['topic']) || []
  end
end
|
110
|
+
|
111
|
+
#Method for obtaining keywords in a text
#Params:
#+text+:: a +String+, the text to tag
#+service+(optional):: a +String+, the name of the service to use, defaults to tagthe and must be one of SERVICES;
#any other value falls back to tagthe
#+token+(optional):: a token to use for calling the service (tagthe doesn't need one), keep in mind that this value
#supersedes the one stored in +~/.webtagger+
#NOTE(review): the service classes configure a one second cache, but they issue POST requests and the
#cache hook in httparty_icebox only redefines +get+ - confirm whether any caching takes place
def tag(text,service="tagthe",token=nil)
  #to_s lets nil or a symbol be used as service name
  service = service.to_s.strip.downcase
  case service
  when "yahoo"
    #the stored token is only looked up for services that need one
    Yahoo.tag(text, token || get_token(service))
  when "alchemy"
    Alchemy.tag(text, token || get_token(service))
  else
    Tagthe.tag(text)
  end
end
|
130
|
+
|
131
|
+
module_function :tag
|
132
|
+
module_function :get_token
|
133
|
+
end #of webtagger module
|
data/test/helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,109 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: webtagger
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- lfborjas
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-08-28 00:00:00 -06:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: thoughtbot-shoulda
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
version: "0"
|
33
|
+
type: :development
|
34
|
+
version_requirements: *id001
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: httparty
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - "="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
hash: 5
|
44
|
+
segments:
|
45
|
+
- 0
|
46
|
+
- 6
|
47
|
+
- 1
|
48
|
+
version: 0.6.1
|
49
|
+
type: :runtime
|
50
|
+
version_requirements: *id002
|
51
|
+
description: Use webtagger to hace easy access to keyword extraction web services (tagthe.net, yahoo and alchemy)
|
52
|
+
email: me@lfborjas.com
|
53
|
+
executables:
|
54
|
+
- webtagger
|
55
|
+
- webtagger
|
56
|
+
extensions: []
|
57
|
+
|
58
|
+
extra_rdoc_files:
|
59
|
+
- LICENSE
|
60
|
+
- README.rdoc
|
61
|
+
files:
|
62
|
+
- .document
|
63
|
+
- .gitignore
|
64
|
+
- LICENSE
|
65
|
+
- README.rdoc
|
66
|
+
- Rakefile
|
67
|
+
- VERSION
|
68
|
+
- bin/webtagger
|
69
|
+
- lib/httparty_icebox.rb
|
70
|
+
- lib/webtagger.rb
|
71
|
+
- test/helper.rb
|
72
|
+
- test/test_webtagger.rb
|
73
|
+
has_rdoc: true
|
74
|
+
homepage: http://github.com/lfborjas/webtagger
|
75
|
+
licenses: []
|
76
|
+
|
77
|
+
post_install_message:
|
78
|
+
rdoc_options:
|
79
|
+
- --charset=UTF-8
|
80
|
+
require_paths:
|
81
|
+
- lib
|
82
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
83
|
+
none: false
|
84
|
+
requirements:
|
85
|
+
- - ">="
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
hash: 3
|
88
|
+
segments:
|
89
|
+
- 0
|
90
|
+
version: "0"
|
91
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
92
|
+
none: false
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
hash: 3
|
97
|
+
segments:
|
98
|
+
- 0
|
99
|
+
version: "0"
|
100
|
+
requirements: []
|
101
|
+
|
102
|
+
rubyforge_project:
|
103
|
+
rubygems_version: 1.3.7
|
104
|
+
signing_key:
|
105
|
+
specification_version: 3
|
106
|
+
summary: Use some popular web services to extract keywords from text
|
107
|
+
test_files:
|
108
|
+
- test/helper.rb
|
109
|
+
- test/test_webtagger.rb
|