webtagger 0.1.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +15 -0
- data/Gemfile.lock +32 -0
- data/README.rdoc +8 -19
- data/Rakefile +23 -32
- data/VERSION +1 -1
- data/lib/webtagger.rb +69 -120
- data/spec/fixtures/alchemy.json +12 -0
- data/spec/fixtures/tagthe.json +1 -0
- data/spec/spec_helper.rb +18 -0
- data/spec/support_spec.rb +1 -0
- data/spec/webtagger_spec.rb +18 -0
- metadata +93 -29
- data/.gitignore +0 -21
- data/bin/webtagger +0 -60
- data/lib/httparty_icebox.rb +0 -263
- data/test/helper.rb +0 -10
- data/test/test_webtagger.rb +0 -7
data/Gemfile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
source "http://rubygems.org"
|
2
|
+
# Add dependencies required to use your gem here.
|
3
|
+
# Example:
|
4
|
+
# gem "activesupport", ">= 2.3.5"
|
5
|
+
|
6
|
+
# Add dependencies to develop your gem here.
|
7
|
+
# Include everything needed to run rake, tests, features, etc.
|
8
|
+
gem 'json'
|
9
|
+
group :development do
|
10
|
+
gem "rspec", "~> 2.3.0"
|
11
|
+
gem "bundler", "~> 1.0.0"
|
12
|
+
gem "jeweler", "~> 1.5.2"
|
13
|
+
gem "rcov", ">= 0"
|
14
|
+
gem "fakeweb", "~> 1.3.0"
|
15
|
+
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
diff-lcs (1.1.2)
|
5
|
+
fakeweb (1.3.0)
|
6
|
+
git (1.2.5)
|
7
|
+
jeweler (1.5.2)
|
8
|
+
bundler (~> 1.0.0)
|
9
|
+
git (>= 1.2.5)
|
10
|
+
rake
|
11
|
+
json (1.4.6)
|
12
|
+
rake (0.8.7)
|
13
|
+
rcov (0.9.9)
|
14
|
+
rspec (2.3.0)
|
15
|
+
rspec-core (~> 2.3.0)
|
16
|
+
rspec-expectations (~> 2.3.0)
|
17
|
+
rspec-mocks (~> 2.3.0)
|
18
|
+
rspec-core (2.3.1)
|
19
|
+
rspec-expectations (2.3.0)
|
20
|
+
diff-lcs (~> 1.1.2)
|
21
|
+
rspec-mocks (2.3.0)
|
22
|
+
|
23
|
+
PLATFORMS
|
24
|
+
ruby
|
25
|
+
|
26
|
+
DEPENDENCIES
|
27
|
+
bundler (~> 1.0.0)
|
28
|
+
fakeweb (~> 1.3.0)
|
29
|
+
jeweler (~> 1.5.2)
|
30
|
+
json
|
31
|
+
rcov
|
32
|
+
rspec (~> 2.3.0)
|
data/README.rdoc
CHANGED
@@ -2,10 +2,9 @@
|
|
2
2
|
|
3
3
|
Webtagger is a simple ruby gem that uses the web intelligence to extract important terms in texts, suitable for tagging them, finding the main subject or automatically building queries.
|
4
4
|
|
5
|
-
It depends on {httparty}[http://github.com/jnunemaker/httparty] and uses the following external APIs:
|
6
5
|
* {Yahoo term extraction}[http://developer.yahoo.com/search/content/V1/termExtraction.html]
|
7
|
-
* {Tag-the-net}[http://tagthe.net]
|
8
|
-
* {Alchemy API}[http://www.alchemyapi.com/api/keyword/textc.html]
|
6
|
+
* {Tag-the-net}[http://tagthe.net] (Needs an API key!)
|
7
|
+
* {Alchemy API}[http://www.alchemyapi.com/api/keyword/textc.html] (Needs an API key!)
|
9
8
|
|
10
9
|
And it's written to support any API in the future.
|
11
10
|
|
@@ -14,29 +13,19 @@ And it's written to support any API in the future.
|
|
14
13
|
|
15
14
|
==Usage
|
16
15
|
|
17
|
-
Ok, little caveat here, you might need an API-key for some of the services, so you might want to run
|
18
|
-
webtagger --configure
|
19
|
-
|
20
|
-
To set or update your API keys
|
21
|
-
Or, you can pass them in the tagging method, like this
|
22
|
-
tags = WebTagger.tag(text, "yahoo", "YOUR-API-KEY")
|
23
|
-
|
24
16
|
Besides that pickle, the standard usage is really simple:
|
25
17
|
require 'webtagger'
|
26
18
|
text = "Hi, I'm text"
|
27
|
-
#you
|
28
|
-
tags = WebTagger.
|
29
|
-
#
|
30
|
-
|
31
|
-
|
19
|
+
#you simply call the appropriate method:
|
20
|
+
tags = WebTagger.tag_with_tagthe(text)
|
21
|
+
#some APIs might need an api key, pass that as the second parameter
|
22
|
+
tags = WebTagger.tag_with_yahoo(text, "YOUR-API-KEY")
|
23
|
+
|
32
24
|
|
33
25
|
WebTagger uses caching so rest assured you won't be throttled by the API providers.
|
34
26
|
|
35
|
-
If something funny happens
|
36
|
-
|
37
|
-
If a http error happens (404, 500, etc), +nil+ will be returned.
|
27
|
+
If something funny happens (a 4XX or 5XX response is returned), nil will be returned.
|
38
28
|
|
39
|
-
|
40
29
|
== Note on Patches/Pull Requests
|
41
30
|
|
42
31
|
* Fork the project.
|
data/Rakefile
CHANGED
@@ -1,55 +1,46 @@
|
|
1
1
|
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
|
2
11
|
require 'rake'
|
3
12
|
|
4
|
-
|
5
|
-
|
6
|
-
|
13
|
+
require 'jeweler'
|
14
|
+
Jeweler::Tasks.new do |gem|
|
15
|
+
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
7
16
|
gem.name = "webtagger"
|
8
17
|
gem.summary = %Q{Use some popular web services to extract keywords from text}
|
9
18
|
gem.description = %Q{Use webtagger to use keyword extraction web services (yahoo, tagthe and alchemy) to extract from a text terms suitable for tagging, summarization, query building, etc.}
|
10
|
-
gem.email = "
|
19
|
+
gem.email = "luisfelipe@lfborjas.com"
|
11
20
|
gem.homepage = "http://github.com/lfborjas/webtagger"
|
12
21
|
gem.authors = ["lfborjas"]
|
13
|
-
gem.add_development_dependency "thoughtbot-shoulda", ">= 0"
|
14
|
-
gem.add_dependency "httparty", "0.6.1"
|
15
|
-
gem.executables << 'webtagger'
|
16
|
-
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
17
|
-
end
|
18
|
-
Jeweler::GemcutterTasks.new
|
19
|
-
rescue LoadError
|
20
|
-
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
21
22
|
end
|
23
|
+
Jeweler::RubygemsDotOrgTasks.new
|
22
24
|
|
23
|
-
require '
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
test.verbose = true
|
25
|
+
require 'rspec/core'
|
26
|
+
require 'rspec/core/rake_task'
|
27
|
+
RSpec::Core::RakeTask.new(:spec) do |spec|
|
28
|
+
spec.pattern = FileList['spec/**/*_spec.rb']
|
28
29
|
end
|
29
30
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
test.libs << 'test'
|
34
|
-
test.pattern = 'test/**/test_*.rb'
|
35
|
-
test.verbose = true
|
36
|
-
end
|
37
|
-
rescue LoadError
|
38
|
-
task :rcov do
|
39
|
-
abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
|
40
|
-
end
|
31
|
+
RSpec::Core::RakeTask.new(:rcov) do |spec|
|
32
|
+
spec.pattern = 'spec/**/*_spec.rb'
|
33
|
+
spec.rcov = true
|
41
34
|
end
|
42
35
|
|
43
|
-
task :
|
44
|
-
|
45
|
-
task :default => :test
|
36
|
+
task :default => :spec
|
46
37
|
|
47
38
|
require 'rake/rdoctask'
|
48
39
|
Rake::RDocTask.new do |rdoc|
|
49
40
|
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
50
41
|
|
51
42
|
rdoc.rdoc_dir = 'rdoc'
|
52
|
-
rdoc.title = "
|
43
|
+
rdoc.title = "scriabin #{version}"
|
53
44
|
rdoc.rdoc_files.include('README*')
|
54
45
|
rdoc.rdoc_files.include('lib/**/*.rb')
|
55
46
|
end
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
1.0.0
|
data/lib/webtagger.rb
CHANGED
@@ -1,133 +1,82 @@
|
|
1
|
-
require
|
2
|
-
require 'httparty'
|
3
|
-
require 'httparty_icebox'
|
1
|
+
%w{net/http json digest/md5}.each{|m| require m }
|
4
2
|
|
5
|
-
#
|
6
|
-
#Because the yahoo and alchemy services require an API key, a command line utility is provided
|
7
|
-
#to add those tokens for subsequent uses of the modules, storing them in <tt>~/.webtagger</tt>
|
3
|
+
#Class for extracting keywords from text. Uses the tagthe, yahoo and alchemyAPI web services.
|
8
4
|
#It uses caching to avoid being throttled by the APIs, via an in-memory, class-level hash (@@cache)
|
9
|
-
|
10
|
-
|
11
|
-
#The services supported by this version
|
12
|
-
SERVICES = ['yahoo', 'alchemy', 'tagthe']
|
13
|
-
|
14
|
-
#A generic exception to handle api call errors
|
15
|
-
class WebTaggerError < RuntimeError
|
16
|
-
attr :response
|
17
|
-
def initialize(resp)
|
18
|
-
@response = resp
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
#Get the persisted token for a service, if no service is provided, all tokens are returned in a hash
|
23
|
-
#Params:
|
24
|
-
#+service+:: the service for which the token should be retrieved, must be one of SERVICES
|
25
|
-
def get_token(service="")
|
26
|
-
service = service.strip.downcase
|
27
|
-
conf = File.join(ENV['HOME'], '.webtagger')
|
28
|
-
return nil unless File.exist? conf
|
29
|
-
srvcs = {}
|
30
|
-
File.open(conf).each do |service_conf|
|
31
|
-
s, t = service_conf.split(/\s*=\s*/) rescue next
|
32
|
-
srvcs[s.strip.downcase] = t.strip
|
33
|
-
end
|
5
|
+
class WebTagger
|
34
6
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
7
|
+
#one of these days, gotta add filesystem cache
|
8
|
+
@@cache = {}
|
9
|
+
#Macro for creating a provider-specific tagger
|
10
|
+
def self.tags_with(service, options={}, &callback)
|
11
|
+
opts = {:uri => "",
|
12
|
+
:use_tokens=>true,
|
13
|
+
:cache=>true,
|
14
|
+
:json=>true,
|
15
|
+
:method=>:post,
|
16
|
+
:text_param=>"text",
|
17
|
+
:token_param=>"",
|
18
|
+
:extra_params=>{} }.merge(options)
|
19
|
+
|
20
|
+
#use the meta-class to inject a static method in this class
|
21
|
+
(class << self; self; end).instance_eval do
|
22
|
+
|
23
|
+
#hack the block: using the star operator we can get an empty second param without fuss
|
24
|
+
define_method("tag_with_#{service.to_s}") do | text, *tokens |
|
25
|
+
|
26
|
+
text_digest = Digest::MD5.hexdigest service.to_s+text
|
27
|
+
callback.call(@@cache[text_digest]) unless @@cache[text_digest].nil?
|
28
|
+
|
29
|
+
query = {opts[:text_param] => text}.merge(opts[:extra_params])
|
30
|
+
query[opts[:token_param]] = *tokens if opts[:use_tokens]
|
31
|
+
|
32
|
+
r = Net::HTTP.post_form URI.parse(opts[:uri]), query
|
33
|
+
|
34
|
+
response = if opts[:json] then JSON.parse(r.body) else r.body end
|
35
|
+
if (100..399) === r.code.to_i
|
36
|
+
@@cache[text_digest] = response
|
37
|
+
callback.call(response)
|
38
|
+
else
|
39
|
+
callback.call(nil)
|
40
|
+
end
|
61
41
|
end
|
62
42
|
end
|
63
43
|
end
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
end
|
83
|
-
return kws
|
84
|
-
else
|
85
|
-
raise WebTaggerError.new(resp), "Error in API call"
|
86
|
-
end
|
87
|
-
end
|
44
|
+
|
45
|
+
Boilerplate = {:yahoo=>{:uri=>"http://search.yahooapis.com/ContentAnalysisService/V1/termExtraction",
|
46
|
+
:token_param=>"appid",
|
47
|
+
:text_param=>"context",
|
48
|
+
:extra_params=>{:output=>"json"}
|
49
|
+
},
|
50
|
+
:alchemy=>{
|
51
|
+
:uri => "http://access.alchemyapi.com/calls/text/TextGetRankedKeywords",
|
52
|
+
:token_param => "apikey",
|
53
|
+
:extra_params=>{:outputMode => "json"}
|
54
|
+
},
|
55
|
+
:tagthe=>{:uri=>"http://tagthe.net/api",
|
56
|
+
:extra_params=>{:view=>"json"}
|
57
|
+
}
|
58
|
+
}
|
59
|
+
|
60
|
+
tags_with :yahoo, Boilerplate[:yahoo] do |r|
|
61
|
+
r['ResultSet']['ResultSet'] if r and r['ResultSet']
|
88
62
|
end
|
89
63
|
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
base_uri "http://tagthe.net/api"
|
97
|
-
cache :store => 'memory', :timeout => 1
|
98
|
-
|
99
|
-
def self.tag(text)
|
100
|
-
resp = post("/", :query => {:text => text, :view=>'json'} )
|
101
|
-
if resp.has_key?('memes') and resp['memes'][0].has_key?('dimensions') \
|
102
|
-
and resp['memes'][0]['dimensions'].has_key?('topic')
|
103
|
-
|
104
|
-
return resp['memes'][0]['dimensions']['topic']
|
105
|
-
else
|
106
|
-
return []
|
64
|
+
tags_with :alchemy, Boilerplate[:alchemy] do |resp|
|
65
|
+
if resp['status'] != 'ERROR'
|
66
|
+
#it's a hash array of [{:text=>"", :relevance=>""}]
|
67
|
+
kws = []
|
68
|
+
resp['keywords'].each do |m|
|
69
|
+
kws.push m["text"]
|
107
70
|
end
|
108
|
-
|
71
|
+
kws
|
72
|
+
end
|
109
73
|
end
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
#superseeds the one stored in +~/.webtagger+ and that, due to caching, might not be used if the request is done
|
117
|
-
#less than a minute after the last one with a different token
|
118
|
-
def tag(text,service="tagthe",token=nil)
|
119
|
-
service = service.strip.downcase
|
120
|
-
token = get_token(service) unless token
|
121
|
-
return case
|
122
|
-
when service == "yahoo"
|
123
|
-
Yahoo.tag(text, token)
|
124
|
-
when service == "alchemy"
|
125
|
-
Alchemy.tag(text, token)
|
126
|
-
else
|
127
|
-
Tagthe.tag(text)
|
74
|
+
|
75
|
+
tags_with :tagthe, Boilerplate[:tagthe] do |resp|
|
76
|
+
if resp.has_key?('memes') and resp['memes'][0].has_key?('dimensions') \
|
77
|
+
and resp['memes'][0]['dimensions'].has_key?('topic')
|
78
|
+
|
79
|
+
resp['memes'][0]['dimensions']['topic']
|
128
80
|
end
|
129
81
|
end
|
130
|
-
|
131
|
-
module_function :tag
|
132
|
-
module_function :get_token
|
133
82
|
end #of WebTagger class
|
@@ -0,0 +1,12 @@
|
|
1
|
+
{
|
2
|
+
"status": "OK",
|
3
|
+
"usage": "By accessing AlchemyAPI or using information generated by AlchemyAPI, you are agreeing to be bound by the AlchemyAPI Terms of Use: http://www.alchemyapi.com/company/terms.html",
|
4
|
+
"url": "",
|
5
|
+
"language": "english",
|
6
|
+
"keywords": [
|
7
|
+
{
|
8
|
+
"text": "general surgeon",
|
9
|
+
"relevance": "0.989011"
|
10
|
+
}
|
11
|
+
]
|
12
|
+
}
|
@@ -0,0 +1 @@
|
|
1
|
+
{"memes":[{"source":"urn:memanage:4F85801E2FE923FF6A0DBBB1A606F1A7","updated":"Sat Jan 22 11:33:19 CET 2011","dimensions":{"topic":["surgeon"],"language":["english"]}}]}
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
2
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
3
|
+
require 'rspec'
|
4
|
+
require 'webtagger'
|
5
|
+
require 'fakeweb'
|
6
|
+
file_opener = lambda {|service| File.open("#{File.dirname(__FILE__)}/fixtures/#{service}.json").read}
|
7
|
+
|
8
|
+
FakeWeb.register_uri(:post, "http://tagthe.net/api", :body=>file_opener.call("tagthe"))
|
9
|
+
FakeWeb.register_uri(:post, "http://access.alchemyapi.com/calls/text/TextGetRankedKeywords",
|
10
|
+
:body=>file_opener.call("alchemy"))
|
11
|
+
|
12
|
+
# Requires supporting files with custom matchers and macros, etc,
|
13
|
+
# in ./support/ and its subdirectories.
|
14
|
+
Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
|
15
|
+
|
16
|
+
RSpec.configure do |config|
|
17
|
+
|
18
|
+
end
|
@@ -0,0 +1 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
describe "WebTagger" do
|
4
|
+
before(:each) do
|
5
|
+
@query = "I'm a very general surgeon, surgeon"
|
6
|
+
end
|
7
|
+
|
8
|
+
it "should tag with tagthe" do
|
9
|
+
r = WebTagger.tag_with_tagthe @query
|
10
|
+
r.should == ["surgeon"]
|
11
|
+
end
|
12
|
+
|
13
|
+
it "should tag with alchemy" do
|
14
|
+
r = WebTagger.tag_with_alchemy @query
|
15
|
+
r.should == ["general surgeon"]
|
16
|
+
end
|
17
|
+
|
18
|
+
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webtagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 23
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
|
-
- 0
|
8
|
-
- 1
|
9
7
|
- 1
|
10
|
-
|
8
|
+
- 0
|
9
|
+
- 0
|
10
|
+
version: 1.0.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- lfborjas
|
@@ -15,13 +15,13 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2011-01-22 00:00:00 -06:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
22
|
-
name: thoughtbot-shoulda
|
23
22
|
prerelease: false
|
24
|
-
|
23
|
+
name: json
|
24
|
+
version_requirements: &id001 !ruby/object:Gem::Requirement
|
25
25
|
none: false
|
26
26
|
requirements:
|
27
27
|
- - ">="
|
@@ -30,29 +30,90 @@ dependencies:
|
|
30
30
|
segments:
|
31
31
|
- 0
|
32
32
|
version: "0"
|
33
|
+
requirement: *id001
|
34
|
+
type: :runtime
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
prerelease: false
|
37
|
+
name: rspec
|
38
|
+
version_requirements: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ~>
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
hash: 3
|
44
|
+
segments:
|
45
|
+
- 2
|
46
|
+
- 3
|
47
|
+
- 0
|
48
|
+
version: 2.3.0
|
49
|
+
requirement: *id002
|
33
50
|
type: :development
|
34
|
-
version_requirements: *id001
|
35
51
|
- !ruby/object:Gem::Dependency
|
36
|
-
name: httparty
|
37
52
|
prerelease: false
|
38
|
-
|
53
|
+
name: bundler
|
54
|
+
version_requirements: &id003 !ruby/object:Gem::Requirement
|
39
55
|
none: false
|
40
56
|
requirements:
|
41
|
-
- -
|
57
|
+
- - ~>
|
42
58
|
- !ruby/object:Gem::Version
|
43
|
-
hash:
|
59
|
+
hash: 23
|
44
60
|
segments:
|
61
|
+
- 1
|
62
|
+
- 0
|
45
63
|
- 0
|
46
|
-
|
64
|
+
version: 1.0.0
|
65
|
+
requirement: *id003
|
66
|
+
type: :development
|
67
|
+
- !ruby/object:Gem::Dependency
|
68
|
+
prerelease: false
|
69
|
+
name: jeweler
|
70
|
+
version_requirements: &id004 !ruby/object:Gem::Requirement
|
71
|
+
none: false
|
72
|
+
requirements:
|
73
|
+
- - ~>
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
hash: 7
|
76
|
+
segments:
|
47
77
|
- 1
|
48
|
-
|
49
|
-
|
50
|
-
|
78
|
+
- 5
|
79
|
+
- 2
|
80
|
+
version: 1.5.2
|
81
|
+
requirement: *id004
|
82
|
+
type: :development
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
prerelease: false
|
85
|
+
name: rcov
|
86
|
+
version_requirements: &id005 !ruby/object:Gem::Requirement
|
87
|
+
none: false
|
88
|
+
requirements:
|
89
|
+
- - ">="
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
hash: 3
|
92
|
+
segments:
|
93
|
+
- 0
|
94
|
+
version: "0"
|
95
|
+
requirement: *id005
|
96
|
+
type: :development
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
prerelease: false
|
99
|
+
name: fakeweb
|
100
|
+
version_requirements: &id006 !ruby/object:Gem::Requirement
|
101
|
+
none: false
|
102
|
+
requirements:
|
103
|
+
- - ~>
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
hash: 27
|
106
|
+
segments:
|
107
|
+
- 1
|
108
|
+
- 3
|
109
|
+
- 0
|
110
|
+
version: 1.3.0
|
111
|
+
requirement: *id006
|
112
|
+
type: :development
|
51
113
|
description: Use webtagger to use keyword extraction web services (yahoo, tagthe and alchemy) to extract from a text terms suitable for tagging, summarization, query building, etc.
|
52
|
-
email:
|
53
|
-
executables:
|
54
|
-
|
55
|
-
- webtagger
|
114
|
+
email: luisfelipe@lfborjas.com
|
115
|
+
executables: []
|
116
|
+
|
56
117
|
extensions: []
|
57
118
|
|
58
119
|
extra_rdoc_files:
|
@@ -60,24 +121,26 @@ extra_rdoc_files:
|
|
60
121
|
- README.rdoc
|
61
122
|
files:
|
62
123
|
- .document
|
63
|
-
-
|
124
|
+
- Gemfile
|
125
|
+
- Gemfile.lock
|
64
126
|
- LICENSE
|
65
127
|
- README.rdoc
|
66
128
|
- Rakefile
|
67
129
|
- VERSION
|
68
|
-
- bin/webtagger
|
69
|
-
- lib/httparty_icebox.rb
|
70
130
|
- lib/webtagger.rb
|
71
|
-
-
|
72
|
-
-
|
131
|
+
- spec/fixtures/alchemy.json
|
132
|
+
- spec/fixtures/tagthe.json
|
133
|
+
- spec/spec_helper.rb
|
134
|
+
- spec/support_spec.rb
|
135
|
+
- spec/webtagger_spec.rb
|
73
136
|
- webtagger.gemspec
|
74
137
|
has_rdoc: true
|
75
138
|
homepage: http://github.com/lfborjas/webtagger
|
76
139
|
licenses: []
|
77
140
|
|
78
141
|
post_install_message:
|
79
|
-
rdoc_options:
|
80
|
-
|
142
|
+
rdoc_options: []
|
143
|
+
|
81
144
|
require_paths:
|
82
145
|
- lib
|
83
146
|
required_ruby_version: !ruby/object:Gem::Requirement
|
@@ -106,5 +169,6 @@ signing_key:
|
|
106
169
|
specification_version: 3
|
107
170
|
summary: Use some popular web services to extract keywords from text
|
108
171
|
test_files:
|
109
|
-
-
|
110
|
-
-
|
172
|
+
- spec/spec_helper.rb
|
173
|
+
- spec/support_spec.rb
|
174
|
+
- spec/webtagger_spec.rb
|
data/.gitignore
DELETED
data/bin/webtagger
DELETED
@@ -1,60 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'optparse'
|
3
|
-
require 'fileutils'
|
4
|
-
$:.unshift File.dirname(__FILE__) + "/../lib"
|
5
|
-
|
6
|
-
require 'webtagger'
|
7
|
-
|
8
|
-
service = ""
|
9
|
-
|
10
|
-
def configure
|
11
|
-
WebTagger::SERVICES.each do |service|
|
12
|
-
next if service == "tagthe"
|
13
|
-
conf = File.join(ENV['HOME'], '.webtagger')
|
14
|
-
FileUtils.touch(conf) unless File.exist? conf
|
15
|
-
srvcs = {}
|
16
|
-
File.open(conf).each do |service_conf|
|
17
|
-
s, t = service_conf.split(/\s*=\s*/) rescue next
|
18
|
-
srvcs[s.strip.downcase] = t ? t.strip : ""
|
19
|
-
end
|
20
|
-
puts "Token for #{service.downcase} (leave blank if you don't want to set it now or you already did): "
|
21
|
-
token = gets
|
22
|
-
srvcs[service]= (token and not token.strip.empty?) ? token : srvcs[service] || ""
|
23
|
-
File.open(conf,'w') do |new_conf|
|
24
|
-
srvcs.each do |s, t|
|
25
|
-
new_conf.write("#{s.upcase}=#{t.strip}\n")
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
OptionParser.new do |opt|
|
32
|
-
opt.banner = "usage: webtagger [OPTIONS] [text]"
|
33
|
-
opt.on('-c', '--configure', String, "Add tokens for each service") do
|
34
|
-
configure()
|
35
|
-
exit
|
36
|
-
end
|
37
|
-
|
38
|
-
opt.on('-t', '--token=[service]', String, "Get the token of a specific service (or all if not specified)") do |s|
|
39
|
-
s="all" if not s or s.empty?
|
40
|
-
puts WebTagger.get_token(s)
|
41
|
-
exit
|
42
|
-
end
|
43
|
-
opt.on('-s', '--service=[service]', String, "Tag the text with the specified service (defaults to tagthe)") do |s|
|
44
|
-
s="" unless WebTagger::SERVICES.include?(s)
|
45
|
-
service = s
|
46
|
-
end
|
47
|
-
opt.on('-h', '--help', "Display the help screen and exit") do
|
48
|
-
puts opt
|
49
|
-
exit
|
50
|
-
end
|
51
|
-
|
52
|
-
end.parse!
|
53
|
-
|
54
|
-
#do the actual tagging:
|
55
|
-
text = ARGV[0]
|
56
|
-
if text and not text.empty?
|
57
|
-
puts "tags: %s"%WebTagger.tag(text, service).inspect[1..-2] rescue puts "Couldn't extract tags"
|
58
|
-
else
|
59
|
-
puts "You must supply some text to tag!"
|
60
|
-
end
|
data/lib/httparty_icebox.rb
DELETED
@@ -1,263 +0,0 @@
|
|
1
|
-
# = Icebox : Caching for HTTParty
|
2
|
-
#
|
3
|
-
# Cache responses in HTTParty models [http://github.com/jnunemaker/httparty]
|
4
|
-
#
|
5
|
-
# === Usage
|
6
|
-
#
|
7
|
-
# class Foo
|
8
|
-
# include HTTParty
|
9
|
-
# include HTTParty::Icebox
|
10
|
-
# cache :store => 'file', :timeout => 600, :location => MY_APP_ROOT.join('tmp', 'cache')
|
11
|
-
# end
|
12
|
-
#
|
13
|
-
# Modeled after Martyn Loughran's APICache [http://github.com/newbamboo/api_cache]
|
14
|
-
# and Ruby On Rails's caching [http://api.rubyonrails.org/classes/ActiveSupport/Cache.html]
|
15
|
-
#
|
16
|
-
# Author: Karel Minarik [www.karmi.cz]
|
17
|
-
#
|
18
|
-
# === Notes
|
19
|
-
#
|
20
|
-
# Thanks to Amit Chakradeo for pointing out response objects have to be stored marhalled on FS
|
21
|
-
# Thanks to Marlin Forbes for pointing out the query parameters have to be included in the cache key
|
22
|
-
#
|
23
|
-
#
|
24
|
-
|
25
|
-
require 'logger'
|
26
|
-
require 'ftools'
|
27
|
-
require 'tmpdir'
|
28
|
-
require 'pathname'
|
29
|
-
require 'digest/md5'
|
30
|
-
|
31
|
-
module HTTParty #:nodoc:
|
32
|
-
# == Caching for HTTParty
|
33
|
-
# See documentation in HTTParty::Icebox::ClassMethods.cache
|
34
|
-
#
|
35
|
-
module Icebox
|
36
|
-
|
37
|
-
module ClassMethods
|
38
|
-
|
39
|
-
# Enable caching and set cache options
|
40
|
-
# Returns memoized cache object
|
41
|
-
#
|
42
|
-
# Following options are available, default values are in []:
|
43
|
-
#
|
44
|
-
# +store+:: Storage mechanism for cached data (memory, filesystem, your own) [memory]
|
45
|
-
# +timeout+:: Cache expiration in seconds [60]
|
46
|
-
# +logger+:: Path to logfile or logger instance [nil, silent]
|
47
|
-
#
|
48
|
-
# Any additional options are passed to the Cache constructor
|
49
|
-
#
|
50
|
-
# Usage:
|
51
|
-
#
|
52
|
-
# # Enable caching in HTTParty, in memory, for 1 minute
|
53
|
-
# cache # Use default values
|
54
|
-
#
|
55
|
-
# # Enable caching in HTTParty, on filesystem (/tmp), for 10 minutes
|
56
|
-
# cache :store => 'file', :timeout => 600, :location => '/tmp/'
|
57
|
-
#
|
58
|
-
# # Use your own cache store (see +AbstractStore+ class below)
|
59
|
-
# cache :store => 'memcached', :timeout => 600, :server => '192.168.1.1:1001'
|
60
|
-
#
|
61
|
-
def cache(options={})
|
62
|
-
options[:store] ||= 'memory'
|
63
|
-
options[:timeout] ||= 60
|
64
|
-
logger = options[:logger]
|
65
|
-
@cache ||= Cache.new( options.delete(:store), options )
|
66
|
-
end
|
67
|
-
|
68
|
-
end
|
69
|
-
|
70
|
-
# When included, extend class with +cache+ method
|
71
|
-
# and redefine +get+ method to use cache
|
72
|
-
#
|
73
|
-
def self.included(receiver) #:nodoc:
|
74
|
-
receiver.extend ClassMethods
|
75
|
-
receiver.class_eval do
|
76
|
-
|
77
|
-
# Get reponse from network
|
78
|
-
#
|
79
|
-
# TODO: Why alias :new :old is not working here? Returns NoMethodError
|
80
|
-
#
|
81
|
-
def self.get_without_caching(path, options={})
|
82
|
-
perform_request Net::HTTP::Get, path, options
|
83
|
-
end
|
84
|
-
|
85
|
-
# Get response from cache, if available
|
86
|
-
#
|
87
|
-
def self.get_with_caching(path, options={})
|
88
|
-
key = path
|
89
|
-
key << options[:query].to_s if defined? options[:query]
|
90
|
-
if cache.exists?(key) and not cache.stale?(key)
|
91
|
-
Cache.logger.debug "CACHE -- GET #{path}#{options[:query]}"
|
92
|
-
return cache.get(key)
|
93
|
-
else
|
94
|
-
Cache.logger.debug "/!\\ NETWORK -- GET #{path}#{options[:query]}"
|
95
|
-
response = get_without_caching(path, options)
|
96
|
-
cache.set(key, response) if response.code == 200
|
97
|
-
return response
|
98
|
-
end
|
99
|
-
end
|
100
|
-
|
101
|
-
# Redefine original HTTParty +get+ method to use cache
|
102
|
-
#
|
103
|
-
def self.get(path, options={})
|
104
|
-
self.get_with_caching(path, options={})
|
105
|
-
end
|
106
|
-
|
107
|
-
end
|
108
|
-
end
|
109
|
-
|
110
|
-
# === Cache container
|
111
|
-
#
|
112
|
-
# Pass a store name ('memory', etc) to new
|
113
|
-
#
|
114
|
-
class Cache
|
115
|
-
attr_accessor :store
|
116
|
-
|
117
|
-
def initialize(store, options={})
|
118
|
-
self.class.logger = options[:logger]
|
119
|
-
@store = self.class.lookup_store(store).new(options)
|
120
|
-
end
|
121
|
-
|
122
|
-
def get(key); @store.get encode(key) unless stale?(key); end
|
123
|
-
def set(key, value); @store.set encode(key), value; end
|
124
|
-
def exists?(key); @store.exists? encode(key); end
|
125
|
-
def stale?(key); @store.stale? encode(key); end
|
126
|
-
|
127
|
-
def self.logger; @logger || default_logger; end
|
128
|
-
def self.default_logger; logger = ::Logger.new(STDERR); end
|
129
|
-
|
130
|
-
# Pass a filename (String), IO object, Logger instance or +nil+ to silence the logger
|
131
|
-
def self.logger=(device); @logger = device.kind_of?(::Logger) ? device : ::Logger.new(device); end
|
132
|
-
|
133
|
-
private
|
134
|
-
|
135
|
-
# Return store class based on passed name
|
136
|
-
def self.lookup_store(name)
|
137
|
-
store_name = "#{name.capitalize}Store"
|
138
|
-
return Store::const_get(store_name)
|
139
|
-
rescue NameError => e
|
140
|
-
raise Store::StoreNotFound, "The cache store '#{store_name}' was not found. Did you loaded any such class?"
|
141
|
-
end
|
142
|
-
|
143
|
-
def encode(key); Digest::MD5.hexdigest(key); end
|
144
|
-
end
|
145
|
-
|
146
|
-
|
147
|
-
# === Cache stores
|
148
|
-
#
|
149
|
-
module Store
|
150
|
-
|
151
|
-
class StoreNotFound < StandardError; end #:nodoc:
|
152
|
-
|
153
|
-
# ==== Abstract Store
|
154
|
-
# Inherit your store from this class
|
155
|
-
# *IMPORTANT*: Do not forget to call +super+ in your +initialize+ method!
|
156
|
-
#
|
157
|
-
class AbstractStore
|
158
|
-
def initialize(options={})
|
159
|
-
raise ArgumentError, "You need to set the :timeout parameter" unless options[:timeout]
|
160
|
-
@timeout = options[:timeout]
|
161
|
-
message = "Cache: Using #{self.class.to_s.split('::').last}"
|
162
|
-
message << " in location: #{options[:location]}" if options[:location]
|
163
|
-
message << " with timeout #{options[:timeout]} sec"
|
164
|
-
Cache.logger.info message unless options[:logger].nil?
|
165
|
-
return self
|
166
|
-
end
|
167
|
-
%w{set get exists? stale?}.each do |method_name|
|
168
|
-
define_method(method_name) { raise NoMethodError, "Please implement method #{method_name} in your store class" }
|
169
|
-
end
|
170
|
-
end
|
171
|
-
|
172
|
-
# ==== Store objects in memory
|
173
|
-
# See HTTParty::Icebox::ClassMethods.cache
|
174
|
-
#
|
175
|
-
class MemoryStore < AbstractStore
|
176
|
-
def initialize(options={})
|
177
|
-
super; @store = {}; self
|
178
|
-
end
|
179
|
-
def set(key, value)
|
180
|
-
Cache.logger.info("Cache: set (#{key})")
|
181
|
-
@store[key] = [Time.now, value]; true
|
182
|
-
end
|
183
|
-
def get(key)
|
184
|
-
data = @store[key][1]
|
185
|
-
Cache.logger.info("Cache: #{data.nil? ? "miss" : "hit"} (#{key})")
|
186
|
-
data
|
187
|
-
end
|
188
|
-
def exists?(key)
|
189
|
-
!@store[key].nil?
|
190
|
-
end
|
191
|
-
def stale?(key)
|
192
|
-
return true unless exists?(key)
|
193
|
-
Time.now - created(key) > @timeout
|
194
|
-
end
|
195
|
-
private
|
196
|
-
def created(key)
|
197
|
-
@store[key][0]
|
198
|
-
end
|
199
|
-
end
|
200
|
-
|
201
|
-
# ==== Store objects on the filesystem
|
202
|
-
# See HTTParty::Icebox::ClassMethods.cache
|
203
|
-
#
|
204
|
-
class FileStore < AbstractStore
|
205
|
-
def initialize(options={})
|
206
|
-
super
|
207
|
-
options[:location] ||= Dir::tmpdir
|
208
|
-
@path = Pathname.new( options[:location] )
|
209
|
-
FileUtils.mkdir_p( @path )
|
210
|
-
self
|
211
|
-
end
|
212
|
-
def set(key, value)
|
213
|
-
Cache.logger.info("Cache: set (#{key})")
|
214
|
-
File.open( @path.join(key), 'w' ) { |file| file << Marshal.dump(value) }
|
215
|
-
true
|
216
|
-
end
|
217
|
-
def get(key)
|
218
|
-
data = Marshal.load(File.read( @path.join(key)))
|
219
|
-
Cache.logger.info("Cache: #{data.nil? ? "miss" : "hit"} (#{key})")
|
220
|
-
data
|
221
|
-
end
|
222
|
-
def exists?(key)
|
223
|
-
File.exists?( @path.join(key) )
|
224
|
-
end
|
225
|
-
def stale?(key)
|
226
|
-
return true unless exists?(key)
|
227
|
-
Time.now - created(key) > @timeout
|
228
|
-
end
|
229
|
-
private
|
230
|
-
def created(key)
|
231
|
-
File.mtime( @path.join(key) )
|
232
|
-
end
|
233
|
-
end
|
234
|
-
end
|
235
|
-
|
236
|
-
end
|
237
|
-
end
|
238
|
-
|
239
|
-
|
240
|
-
# Major parts of this code are based on architecture of ApiCache.
|
241
|
-
# Copyright (c) 2008 Martyn Loughran
|
242
|
-
#
|
243
|
-
# Other parts are inspired by the ActiveSupport::Cache in Ruby On Rails.
|
244
|
-
# Copyright (c) 2005-2009 David Heinemeier Hansson
|
245
|
-
#
|
246
|
-
# Permission is hereby granted, free of charge, to any person obtaining
|
247
|
-
# a copy of this software and associated documentation files (the
|
248
|
-
# "Software"), to deal in the Software without restriction, including
|
249
|
-
# without limitation the rights to use, copy, modify, merge, publish,
|
250
|
-
# distribute, sublicense, and/or sell copies of the Software, and to
|
251
|
-
# permit persons to whom the Software is furnished to do so, subject to
|
252
|
-
# the following conditions:
|
253
|
-
#
|
254
|
-
# The above copyright notice and this permission notice shall be
|
255
|
-
# included in all copies or substantial portions of the Software.
|
256
|
-
#
|
257
|
-
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
258
|
-
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
259
|
-
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
260
|
-
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
261
|
-
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
262
|
-
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
263
|
-
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/test/helper.rb
DELETED