webtagger 0.1.1 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +15 -0
- data/Gemfile.lock +32 -0
- data/README.rdoc +8 -19
- data/Rakefile +23 -32
- data/VERSION +1 -1
- data/lib/webtagger.rb +69 -120
- data/spec/fixtures/alchemy.json +12 -0
- data/spec/fixtures/tagthe.json +1 -0
- data/spec/spec_helper.rb +18 -0
- data/spec/support_spec.rb +1 -0
- data/spec/webtagger_spec.rb +18 -0
- metadata +93 -29
- data/.gitignore +0 -21
- data/bin/webtagger +0 -60
- data/lib/httparty_icebox.rb +0 -263
- data/test/helper.rb +0 -10
- data/test/test_webtagger.rb +0 -7
data/Gemfile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
source "http://rubygems.org"
|
2
|
+
# Add dependencies required to use your gem here.
|
3
|
+
# Example:
|
4
|
+
# gem "activesupport", ">= 2.3.5"
|
5
|
+
|
6
|
+
# Add dependencies to develop your gem here.
|
7
|
+
# Include everything needed to run rake, tests, features, etc.
|
8
|
+
gem 'json'
|
9
|
+
group :development do
|
10
|
+
gem "rspec", "~> 2.3.0"
|
11
|
+
gem "bundler", "~> 1.0.0"
|
12
|
+
gem "jeweler", "~> 1.5.2"
|
13
|
+
gem "rcov", ">= 0"
|
14
|
+
gem "fakeweb", "~> 1.3.0"
|
15
|
+
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
diff-lcs (1.1.2)
|
5
|
+
fakeweb (1.3.0)
|
6
|
+
git (1.2.5)
|
7
|
+
jeweler (1.5.2)
|
8
|
+
bundler (~> 1.0.0)
|
9
|
+
git (>= 1.2.5)
|
10
|
+
rake
|
11
|
+
json (1.4.6)
|
12
|
+
rake (0.8.7)
|
13
|
+
rcov (0.9.9)
|
14
|
+
rspec (2.3.0)
|
15
|
+
rspec-core (~> 2.3.0)
|
16
|
+
rspec-expectations (~> 2.3.0)
|
17
|
+
rspec-mocks (~> 2.3.0)
|
18
|
+
rspec-core (2.3.1)
|
19
|
+
rspec-expectations (2.3.0)
|
20
|
+
diff-lcs (~> 1.1.2)
|
21
|
+
rspec-mocks (2.3.0)
|
22
|
+
|
23
|
+
PLATFORMS
|
24
|
+
ruby
|
25
|
+
|
26
|
+
DEPENDENCIES
|
27
|
+
bundler (~> 1.0.0)
|
28
|
+
fakeweb (~> 1.3.0)
|
29
|
+
jeweler (~> 1.5.2)
|
30
|
+
json
|
31
|
+
rcov
|
32
|
+
rspec (~> 2.3.0)
|
data/README.rdoc
CHANGED
@@ -2,10 +2,9 @@
|
|
2
2
|
|
3
3
|
Webtagger is a simple ruby gem that uses web intelligence to extract important terms from texts, suitable for tagging them, finding the main subject or automatically building queries.
|
4
4
|
|
5
|
-
It depends on {httparty}[http://github.com/jnunemaker/httparty] and uses the following external APIs:
|
6
5
|
* {Yahoo term extraction}[http://developer.yahoo.com/search/content/V1/termExtraction.html]
|
7
|
-
* {Tag-the-net}[http://tagthe.net]
|
8
|
-
* {Alchemy API}[http://www.alchemyapi.com/api/keyword/textc.html]
|
6
|
+
* {Tag-the-net}[http://tagthe.net] (Needs an API key!)
|
7
|
+
* {Alchemy API}[http://www.alchemyapi.com/api/keyword/textc.html] (Needs an API key!)
|
9
8
|
|
10
9
|
And it's written to support any API in the future.
|
11
10
|
|
@@ -14,29 +13,19 @@ And it's written to support any API in the future.
|
|
14
13
|
|
15
14
|
==Usage
|
16
15
|
|
17
|
-
Ok, little caveat here, you might need an API-key for some of the services, so you might want to run
|
18
|
-
webtagger --configure
|
19
|
-
|
20
|
-
To set or update your API keys
|
21
|
-
Or, you can pass them in the tagging method, like this
|
22
|
-
tags = WebTagger.tag(text, "yahoo", "YOUR-API-KEY")
|
23
|
-
|
24
16
|
Besides that pickle, the standard usage is really simple:
|
25
17
|
require 'webtagger'
|
26
18
|
text = "Hi, I'm text"
|
27
|
-
#you
|
28
|
-
tags = WebTagger.
|
29
|
-
#
|
30
|
-
|
31
|
-
|
19
|
+
#you simply call the appropriate method:
|
20
|
+
tags = WebTagger.tag_with_tagthe(text)
|
21
|
+
#some APIs might need an api key, pass that as the second parameter
|
22
|
+
tags = WebTagger.tag_with_yahoo(text, "YOUR-API-KEY")
|
23
|
+
|
32
24
|
|
33
25
|
WebTagger uses caching so rest assured you won't be throttled by the API providers.
|
34
26
|
|
35
|
-
If something funny happens
|
36
|
-
|
37
|
-
If a http error happens (404, 500, etc), +nil+ will be returned.
|
27
|
+
If something funny happens (a 4XX or 5XX response is returned), nil will be returned.
|
38
28
|
|
39
|
-
|
40
29
|
== Note on Patches/Pull Requests
|
41
30
|
|
42
31
|
* Fork the project.
|
data/Rakefile
CHANGED
@@ -1,55 +1,46 @@
|
|
1
1
|
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
|
2
11
|
require 'rake'
|
3
12
|
|
4
|
-
|
5
|
-
|
6
|
-
|
13
|
+
require 'jeweler'
|
14
|
+
Jeweler::Tasks.new do |gem|
|
15
|
+
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
7
16
|
gem.name = "webtagger"
|
8
17
|
gem.summary = %Q{Use some popular web services to extract keywords from text}
|
9
18
|
gem.description = %Q{Use webtagger to use keyword extraction web services (yahoo, tagthe and alchemy) to extract from a text terms suitable for tagging, summarization, query building, etc.}
|
10
|
-
gem.email = "
|
19
|
+
gem.email = "luisfelipe@lfborjas.com"
|
11
20
|
gem.homepage = "http://github.com/lfborjas/webtagger"
|
12
21
|
gem.authors = ["lfborjas"]
|
13
|
-
gem.add_development_dependency "thoughtbot-shoulda", ">= 0"
|
14
|
-
gem.add_dependency "httparty", "0.6.1"
|
15
|
-
gem.executables << 'webtagger'
|
16
|
-
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
17
|
-
end
|
18
|
-
Jeweler::GemcutterTasks.new
|
19
|
-
rescue LoadError
|
20
|
-
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
21
22
|
end
|
23
|
+
Jeweler::RubygemsDotOrgTasks.new
|
22
24
|
|
23
|
-
require '
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
test.verbose = true
|
25
|
+
require 'rspec/core'
|
26
|
+
require 'rspec/core/rake_task'
|
27
|
+
RSpec::Core::RakeTask.new(:spec) do |spec|
|
28
|
+
spec.pattern = FileList['spec/**/*_spec.rb']
|
28
29
|
end
|
29
30
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
test.libs << 'test'
|
34
|
-
test.pattern = 'test/**/test_*.rb'
|
35
|
-
test.verbose = true
|
36
|
-
end
|
37
|
-
rescue LoadError
|
38
|
-
task :rcov do
|
39
|
-
abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
|
40
|
-
end
|
31
|
+
RSpec::Core::RakeTask.new(:rcov) do |spec|
|
32
|
+
spec.pattern = 'spec/**/*_spec.rb'
|
33
|
+
spec.rcov = true
|
41
34
|
end
|
42
35
|
|
43
|
-
task :
|
44
|
-
|
45
|
-
task :default => :test
|
36
|
+
task :default => :spec
|
46
37
|
|
47
38
|
require 'rake/rdoctask'
|
48
39
|
Rake::RDocTask.new do |rdoc|
|
49
40
|
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
50
41
|
|
51
42
|
rdoc.rdoc_dir = 'rdoc'
|
52
|
-
rdoc.title = "
|
43
|
+
rdoc.title = "scriabin #{version}"
|
53
44
|
rdoc.rdoc_files.include('README*')
|
54
45
|
rdoc.rdoc_files.include('lib/**/*.rb')
|
55
46
|
end
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
1.0.0
|
data/lib/webtagger.rb
CHANGED
@@ -1,133 +1,82 @@
|
|
1
|
-
require
|
2
|
-
require 'httparty'
|
3
|
-
require 'httparty_icebox'
|
1
|
+
%w{net/http json digest/md5}.each{|m| require m }
|
4
2
|
|
5
|
-
#
|
6
|
-
#Because the yahoo and alchemy services require an API key, a command line utility is provided
|
7
|
-
#to add those tokens for subsequent uses of the modules, storing them in <tt>~/.webtagger</tt>
|
3
|
+
#Class for extracting keywords from text. Uses the tagthe, yahoo and alchemyAPI web services.
|
8
4
|
#it uses caching to avoid being throttled by the apis, via the httparty_icebox gem
|
9
|
-
|
10
|
-
|
11
|
-
#The services supported by this version
|
12
|
-
SERVICES = ['yahoo', 'alchemy', 'tagthe']
|
13
|
-
|
14
|
-
#A generic exception to handle api call errors
|
15
|
-
class WebTaggerError < RuntimeError
|
16
|
-
attr :response
|
17
|
-
def initialize(resp)
|
18
|
-
@response = resp
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
#Get the persisted token for a service, if no service is provided, all tokens are returned in a hash
|
23
|
-
#Params:
|
24
|
-
#+service+:: the service for which the token should be retrieved, must be one of SERVICES
|
25
|
-
def get_token(service="")
|
26
|
-
service = service.strip.downcase
|
27
|
-
conf = File.join(ENV['HOME'], '.webtagger')
|
28
|
-
return nil unless File.exist? conf
|
29
|
-
srvcs = {}
|
30
|
-
File.open(conf).each do |service_conf|
|
31
|
-
s, t = service_conf.split(/\s*=\s*/) rescue next
|
32
|
-
srvcs[s.strip.downcase] = t.strip
|
33
|
-
end
|
5
|
+
class WebTagger
|
34
6
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
7
|
+
#one of these days, gotta add filesystem cache
|
8
|
+
@@cache = {}
|
9
|
+
#Macro for creating a provider-specific tagger
|
10
|
+
def self.tags_with(service, options={}, &callback)
|
11
|
+
opts = {:uri => "",
|
12
|
+
:use_tokens=>true,
|
13
|
+
:cache=>true,
|
14
|
+
:json=>true,
|
15
|
+
:method=>:post,
|
16
|
+
:text_param=>"text",
|
17
|
+
:token_param=>"",
|
18
|
+
:extra_params=>{} }.merge(options)
|
19
|
+
|
20
|
+
#use the meta-class to inject a static method in this class
|
21
|
+
(class << self; self; end).instance_eval do
|
22
|
+
|
23
|
+
#hack the block: using the star operator we can get an empty second param without fuss
|
24
|
+
define_method("tag_with_#{service.to_s}") do | text, *tokens |
|
25
|
+
|
26
|
+
text_digest = Digest::MD5.hexdigest service.to_s+text
|
27
|
+
callback.call(@@cache[text_digest]) unless @@cache[text_digest].nil?
|
28
|
+
|
29
|
+
query = {opts[:text_param] => text}.merge(opts[:extra_params])
|
30
|
+
query[opts[:token_param]] = *tokens if opts[:use_tokens]
|
31
|
+
|
32
|
+
r = Net::HTTP.post_form URI.parse(opts[:uri]), query
|
33
|
+
|
34
|
+
response = if opts[:json] then JSON.parse(r.body) else r.body end
|
35
|
+
if (100..399) === r.code.to_i
|
36
|
+
@@cache[text_digest] = response
|
37
|
+
callback.call(response)
|
38
|
+
else
|
39
|
+
callback.call(nil)
|
40
|
+
end
|
61
41
|
end
|
62
42
|
end
|
63
43
|
end
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
end
|
83
|
-
return kws
|
84
|
-
else
|
85
|
-
raise WebTaggerError.new(resp), "Error in API call"
|
86
|
-
end
|
87
|
-
end
|
44
|
+
|
45
|
+
Boilerplate = {:yahoo=>{:uri=>"http://search.yahooapis.com/ContentAnalysisService/V1/termExtraction",
|
46
|
+
:token_param=>"appid",
|
47
|
+
:text_param=>"context",
|
48
|
+
:extra_params=>{:output=>"json"}
|
49
|
+
},
|
50
|
+
:alchemy=>{
|
51
|
+
:uri => "http://access.alchemyapi.com/calls/text/TextGetRankedKeywords",
|
52
|
+
:token_param => "apikey",
|
53
|
+
:extra_params=>{:outputMode => "json"}
|
54
|
+
},
|
55
|
+
:tagthe=>{:uri=>"http://tagthe.net/api",
|
56
|
+
:extra_params=>{:view=>"json"}
|
57
|
+
}
|
58
|
+
}
|
59
|
+
|
60
|
+
tags_with :yahoo, Boilerplate[:yahoo] do |r|
|
61
|
+
r['ResultSet']['ResultSet'] if r and r['ResultSet']
|
88
62
|
end
|
89
63
|
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
base_uri "http://tagthe.net/api"
|
97
|
-
cache :store => 'memory', :timeout => 1
|
98
|
-
|
99
|
-
def self.tag(text)
|
100
|
-
resp = post("/", :query => {:text => text, :view=>'json'} )
|
101
|
-
if resp.has_key?('memes') and resp['memes'][0].has_key?('dimensions') \
|
102
|
-
and resp['memes'][0]['dimensions'].has_key?('topic')
|
103
|
-
|
104
|
-
return resp['memes'][0]['dimensions']['topic']
|
105
|
-
else
|
106
|
-
return []
|
64
|
+
tags_with :alchemy, Boilerplate[:alchemy] do |resp|
|
65
|
+
if resp['status'] != 'ERROR'
|
66
|
+
#it's a hash array of [{:text=>"", :relevance=>""}]
|
67
|
+
kws = []
|
68
|
+
resp['keywords'].each do |m|
|
69
|
+
kws.push m["text"]
|
107
70
|
end
|
108
|
-
|
71
|
+
kws
|
72
|
+
end
|
109
73
|
end
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
#superseeds the one stored in +~/.webtagger+ and that, due to caching, might not be used if the request is done
|
117
|
-
#less than a minute after the last one with a different token
|
118
|
-
def tag(text,service="tagthe",token=nil)
|
119
|
-
service = service.strip.downcase
|
120
|
-
token = get_token(service) unless token
|
121
|
-
return case
|
122
|
-
when service == "yahoo"
|
123
|
-
Yahoo.tag(text, token)
|
124
|
-
when service == "alchemy"
|
125
|
-
Alchemy.tag(text, token)
|
126
|
-
else
|
127
|
-
Tagthe.tag(text)
|
74
|
+
|
75
|
+
tags_with :tagthe, Boilerplate[:tagthe] do |resp|
|
76
|
+
if resp.has_key?('memes') and resp['memes'][0].has_key?('dimensions') \
|
77
|
+
and resp['memes'][0]['dimensions'].has_key?('topic')
|
78
|
+
|
79
|
+
resp['memes'][0]['dimensions']['topic']
|
128
80
|
end
|
129
81
|
end
|
130
|
-
|
131
|
-
module_function :tag
|
132
|
-
module_function :get_token
|
133
82
|
end #of webtagger module
|
@@ -0,0 +1,12 @@
|
|
1
|
+
{
|
2
|
+
"status": "OK",
|
3
|
+
"usage": "By accessing AlchemyAPI or using information generated by AlchemyAPI, you are agreeing to be bound by the AlchemyAPI Terms of Use: http://www.alchemyapi.com/company/terms.html",
|
4
|
+
"url": "",
|
5
|
+
"language": "english",
|
6
|
+
"keywords": [
|
7
|
+
{
|
8
|
+
"text": "general surgeon",
|
9
|
+
"relevance": "0.989011"
|
10
|
+
}
|
11
|
+
]
|
12
|
+
}
|
@@ -0,0 +1 @@
|
|
1
|
+
{"memes":[{"source":"urn:memanage:4F85801E2FE923FF6A0DBBB1A606F1A7","updated":"Sat Jan 22 11:33:19 CET 2011","dimensions":{"topic":["surgeon"],"language":["english"]}}]}
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
2
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
3
|
+
require 'rspec'
|
4
|
+
require 'webtagger'
|
5
|
+
require 'fakeweb'
|
6
|
+
file_opener = lambda {|service| File.open("#{File.dirname(__FILE__)}/fixtures/#{service}.json").read}
|
7
|
+
|
8
|
+
FakeWeb.register_uri(:post, "http://tagthe.net/api", :body=>file_opener.call("tagthe"))
|
9
|
+
FakeWeb.register_uri(:post, "http://access.alchemyapi.com/calls/text/TextGetRankedKeywords",
|
10
|
+
:body=>file_opener.call("alchemy"))
|
11
|
+
|
12
|
+
# Requires supporting files with custom matchers and macros, etc,
|
13
|
+
# in ./support/ and its subdirectories.
|
14
|
+
Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
|
15
|
+
|
16
|
+
RSpec.configure do |config|
|
17
|
+
|
18
|
+
end
|
@@ -0,0 +1 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
describe "WebTagger" do
|
4
|
+
before(:each) do
|
5
|
+
@query = "I'm a very general surgeon, surgeon"
|
6
|
+
end
|
7
|
+
|
8
|
+
it "should tag with tagthe" do
|
9
|
+
r = WebTagger.tag_with_tagthe @query
|
10
|
+
r.should == ["surgeon"]
|
11
|
+
end
|
12
|
+
|
13
|
+
it "should tag with alchemy" do
|
14
|
+
r = WebTagger.tag_with_alchemy @query
|
15
|
+
r.should == ["general surgeon"]
|
16
|
+
end
|
17
|
+
|
18
|
+
end
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webtagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 23
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
|
-
- 0
|
8
|
-
- 1
|
9
7
|
- 1
|
10
|
-
|
8
|
+
- 0
|
9
|
+
- 0
|
10
|
+
version: 1.0.0
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- lfborjas
|
@@ -15,13 +15,13 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2011-01-22 00:00:00 -06:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
22
|
-
name: thoughtbot-shoulda
|
23
22
|
prerelease: false
|
24
|
-
|
23
|
+
name: json
|
24
|
+
version_requirements: &id001 !ruby/object:Gem::Requirement
|
25
25
|
none: false
|
26
26
|
requirements:
|
27
27
|
- - ">="
|
@@ -30,29 +30,90 @@ dependencies:
|
|
30
30
|
segments:
|
31
31
|
- 0
|
32
32
|
version: "0"
|
33
|
+
requirement: *id001
|
34
|
+
type: :runtime
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
prerelease: false
|
37
|
+
name: rspec
|
38
|
+
version_requirements: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ~>
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
hash: 3
|
44
|
+
segments:
|
45
|
+
- 2
|
46
|
+
- 3
|
47
|
+
- 0
|
48
|
+
version: 2.3.0
|
49
|
+
requirement: *id002
|
33
50
|
type: :development
|
34
|
-
version_requirements: *id001
|
35
51
|
- !ruby/object:Gem::Dependency
|
36
|
-
name: httparty
|
37
52
|
prerelease: false
|
38
|
-
|
53
|
+
name: bundler
|
54
|
+
version_requirements: &id003 !ruby/object:Gem::Requirement
|
39
55
|
none: false
|
40
56
|
requirements:
|
41
|
-
- -
|
57
|
+
- - ~>
|
42
58
|
- !ruby/object:Gem::Version
|
43
|
-
hash:
|
59
|
+
hash: 23
|
44
60
|
segments:
|
61
|
+
- 1
|
62
|
+
- 0
|
45
63
|
- 0
|
46
|
-
|
64
|
+
version: 1.0.0
|
65
|
+
requirement: *id003
|
66
|
+
type: :development
|
67
|
+
- !ruby/object:Gem::Dependency
|
68
|
+
prerelease: false
|
69
|
+
name: jeweler
|
70
|
+
version_requirements: &id004 !ruby/object:Gem::Requirement
|
71
|
+
none: false
|
72
|
+
requirements:
|
73
|
+
- - ~>
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
hash: 7
|
76
|
+
segments:
|
47
77
|
- 1
|
48
|
-
|
49
|
-
|
50
|
-
|
78
|
+
- 5
|
79
|
+
- 2
|
80
|
+
version: 1.5.2
|
81
|
+
requirement: *id004
|
82
|
+
type: :development
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
prerelease: false
|
85
|
+
name: rcov
|
86
|
+
version_requirements: &id005 !ruby/object:Gem::Requirement
|
87
|
+
none: false
|
88
|
+
requirements:
|
89
|
+
- - ">="
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
hash: 3
|
92
|
+
segments:
|
93
|
+
- 0
|
94
|
+
version: "0"
|
95
|
+
requirement: *id005
|
96
|
+
type: :development
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
prerelease: false
|
99
|
+
name: fakeweb
|
100
|
+
version_requirements: &id006 !ruby/object:Gem::Requirement
|
101
|
+
none: false
|
102
|
+
requirements:
|
103
|
+
- - ~>
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
hash: 27
|
106
|
+
segments:
|
107
|
+
- 1
|
108
|
+
- 3
|
109
|
+
- 0
|
110
|
+
version: 1.3.0
|
111
|
+
requirement: *id006
|
112
|
+
type: :development
|
51
113
|
description: Use webtagger to use keyword extraction web services (yahoo, tagthe and alchemy) to extract from a text terms suitable for tagging, summarization, query building, etc.
|
52
|
-
email:
|
53
|
-
executables:
|
54
|
-
|
55
|
-
- webtagger
|
114
|
+
email: luisfelipe@lfborjas.com
|
115
|
+
executables: []
|
116
|
+
|
56
117
|
extensions: []
|
57
118
|
|
58
119
|
extra_rdoc_files:
|
@@ -60,24 +121,26 @@ extra_rdoc_files:
|
|
60
121
|
- README.rdoc
|
61
122
|
files:
|
62
123
|
- .document
|
63
|
-
-
|
124
|
+
- Gemfile
|
125
|
+
- Gemfile.lock
|
64
126
|
- LICENSE
|
65
127
|
- README.rdoc
|
66
128
|
- Rakefile
|
67
129
|
- VERSION
|
68
|
-
- bin/webtagger
|
69
|
-
- lib/httparty_icebox.rb
|
70
130
|
- lib/webtagger.rb
|
71
|
-
-
|
72
|
-
-
|
131
|
+
- spec/fixtures/alchemy.json
|
132
|
+
- spec/fixtures/tagthe.json
|
133
|
+
- spec/spec_helper.rb
|
134
|
+
- spec/support_spec.rb
|
135
|
+
- spec/webtagger_spec.rb
|
73
136
|
- webtagger.gemspec
|
74
137
|
has_rdoc: true
|
75
138
|
homepage: http://github.com/lfborjas/webtagger
|
76
139
|
licenses: []
|
77
140
|
|
78
141
|
post_install_message:
|
79
|
-
rdoc_options:
|
80
|
-
|
142
|
+
rdoc_options: []
|
143
|
+
|
81
144
|
require_paths:
|
82
145
|
- lib
|
83
146
|
required_ruby_version: !ruby/object:Gem::Requirement
|
@@ -106,5 +169,6 @@ signing_key:
|
|
106
169
|
specification_version: 3
|
107
170
|
summary: Use some popular web services to extract keywords from text
|
108
171
|
test_files:
|
109
|
-
-
|
110
|
-
-
|
172
|
+
- spec/spec_helper.rb
|
173
|
+
- spec/support_spec.rb
|
174
|
+
- spec/webtagger_spec.rb
|
data/.gitignore
DELETED
data/bin/webtagger
DELETED
@@ -1,60 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'optparse'
|
3
|
-
require 'fileutils'
|
4
|
-
$:.unshift File.dirname(__FILE__) + "/../lib"
|
5
|
-
|
6
|
-
require 'webtagger'
|
7
|
-
|
8
|
-
service = ""
|
9
|
-
|
10
|
-
def configure
|
11
|
-
WebTagger::SERVICES.each do |service|
|
12
|
-
next if service == "tagthe"
|
13
|
-
conf = File.join(ENV['HOME'], '.webtagger')
|
14
|
-
FileUtils.touch(conf) unless File.exist? conf
|
15
|
-
srvcs = {}
|
16
|
-
File.open(conf).each do |service_conf|
|
17
|
-
s, t = service_conf.split(/\s*=\s*/) rescue next
|
18
|
-
srvcs[s.strip.downcase] = t ? t.strip : ""
|
19
|
-
end
|
20
|
-
puts "Token for #{service.downcase} (leave blank if you don't want to set it now or you already did): "
|
21
|
-
token = gets
|
22
|
-
srvcs[service]= (token and not token.strip.empty?) ? token : srvcs[service] || ""
|
23
|
-
File.open(conf,'w') do |new_conf|
|
24
|
-
srvcs.each do |s, t|
|
25
|
-
new_conf.write("#{s.upcase}=#{t.strip}\n")
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
OptionParser.new do |opt|
|
32
|
-
opt.banner = "usage: webtagger [OPTIONS] [text]"
|
33
|
-
opt.on('-c', '--configure', String, "Add tokens for each service") do
|
34
|
-
configure()
|
35
|
-
exit
|
36
|
-
end
|
37
|
-
|
38
|
-
opt.on('-t', '--token=[service]', String, "Get the token of a specific service (or all if not specified)") do |s|
|
39
|
-
s="all" if not s or s.empty?
|
40
|
-
puts WebTagger.get_token(s)
|
41
|
-
exit
|
42
|
-
end
|
43
|
-
opt.on('-s', '--service=[service]', String, "Tag the text with the specified service (defaults to tagthe)") do |s|
|
44
|
-
s="" unless WebTagger::SERVICES.include?(s)
|
45
|
-
service = s
|
46
|
-
end
|
47
|
-
opt.on('-h', '--help', "Display the help screen and exit") do
|
48
|
-
puts opt
|
49
|
-
exit
|
50
|
-
end
|
51
|
-
|
52
|
-
end.parse!
|
53
|
-
|
54
|
-
#do the actual tagging:
|
55
|
-
text = ARGV[0]
|
56
|
-
if text and not text.empty?
|
57
|
-
puts "tags: %s"%WebTagger.tag(text, service).inspect[1..-2] rescue puts "Couldn't extract tags"
|
58
|
-
else
|
59
|
-
puts "You must supply some text to tag!"
|
60
|
-
end
|
data/lib/httparty_icebox.rb
DELETED
@@ -1,263 +0,0 @@
|
|
1
|
-
# = Icebox : Caching for HTTParty
|
2
|
-
#
|
3
|
-
# Cache responses in HTTParty models [http://github.com/jnunemaker/httparty]
|
4
|
-
#
|
5
|
-
# === Usage
|
6
|
-
#
|
7
|
-
# class Foo
|
8
|
-
# include HTTParty
|
9
|
-
# include HTTParty::Icebox
|
10
|
-
# cache :store => 'file', :timeout => 600, :location => MY_APP_ROOT.join('tmp', 'cache')
|
11
|
-
# end
|
12
|
-
#
|
13
|
-
# Modeled after Martyn Loughran's APICache [http://github.com/newbamboo/api_cache]
|
14
|
-
# and Ruby On Rails's caching [http://api.rubyonrails.org/classes/ActiveSupport/Cache.html]
|
15
|
-
#
|
16
|
-
# Author: Karel Minarik [www.karmi.cz]
|
17
|
-
#
|
18
|
-
# === Notes
|
19
|
-
#
|
20
|
-
# Thanks to Amit Chakradeo for pointing out response objects have to be stored marhalled on FS
|
21
|
-
# Thanks to Marlin Forbes for pointing out the query parameters have to be included in the cache key
|
22
|
-
#
|
23
|
-
#
|
24
|
-
|
25
|
-
require 'logger'
|
26
|
-
require 'ftools'
|
27
|
-
require 'tmpdir'
|
28
|
-
require 'pathname'
|
29
|
-
require 'digest/md5'
|
30
|
-
|
31
|
-
module HTTParty #:nodoc:
|
32
|
-
# == Caching for HTTParty
|
33
|
-
# See documentation in HTTParty::Icebox::ClassMethods.cache
|
34
|
-
#
|
35
|
-
module Icebox
|
36
|
-
|
37
|
-
module ClassMethods
|
38
|
-
|
39
|
-
# Enable caching and set cache options
|
40
|
-
# Returns memoized cache object
|
41
|
-
#
|
42
|
-
# Following options are available, default values are in []:
|
43
|
-
#
|
44
|
-
# +store+:: Storage mechanism for cached data (memory, filesystem, your own) [memory]
|
45
|
-
# +timeout+:: Cache expiration in seconds [60]
|
46
|
-
# +logger+:: Path to logfile or logger instance [nil, silent]
|
47
|
-
#
|
48
|
-
# Any additional options are passed to the Cache constructor
|
49
|
-
#
|
50
|
-
# Usage:
|
51
|
-
#
|
52
|
-
# # Enable caching in HTTParty, in memory, for 1 minute
|
53
|
-
# cache # Use default values
|
54
|
-
#
|
55
|
-
# # Enable caching in HTTParty, on filesystem (/tmp), for 10 minutes
|
56
|
-
# cache :store => 'file', :timeout => 600, :location => '/tmp/'
|
57
|
-
#
|
58
|
-
# # Use your own cache store (see +AbstractStore+ class below)
|
59
|
-
# cache :store => 'memcached', :timeout => 600, :server => '192.168.1.1:1001'
|
60
|
-
#
|
61
|
-
def cache(options={})
|
62
|
-
options[:store] ||= 'memory'
|
63
|
-
options[:timeout] ||= 60
|
64
|
-
logger = options[:logger]
|
65
|
-
@cache ||= Cache.new( options.delete(:store), options )
|
66
|
-
end
|
67
|
-
|
68
|
-
end
|
69
|
-
|
70
|
-
# When included, extend class with +cache+ method
|
71
|
-
# and redefine +get+ method to use cache
|
72
|
-
#
|
73
|
-
def self.included(receiver) #:nodoc:
|
74
|
-
receiver.extend ClassMethods
|
75
|
-
receiver.class_eval do
|
76
|
-
|
77
|
-
# Get reponse from network
|
78
|
-
#
|
79
|
-
# TODO: Why alias :new :old is not working here? Returns NoMethodError
|
80
|
-
#
|
81
|
-
def self.get_without_caching(path, options={})
|
82
|
-
perform_request Net::HTTP::Get, path, options
|
83
|
-
end
|
84
|
-
|
85
|
-
# Get response from cache, if available
|
86
|
-
#
|
87
|
-
def self.get_with_caching(path, options={})
|
88
|
-
key = path
|
89
|
-
key << options[:query].to_s if defined? options[:query]
|
90
|
-
if cache.exists?(key) and not cache.stale?(key)
|
91
|
-
Cache.logger.debug "CACHE -- GET #{path}#{options[:query]}"
|
92
|
-
return cache.get(key)
|
93
|
-
else
|
94
|
-
Cache.logger.debug "/!\\ NETWORK -- GET #{path}#{options[:query]}"
|
95
|
-
response = get_without_caching(path, options)
|
96
|
-
cache.set(key, response) if response.code == 200
|
97
|
-
return response
|
98
|
-
end
|
99
|
-
end
|
100
|
-
|
101
|
-
# Redefine original HTTParty +get+ method to use cache
|
102
|
-
#
|
103
|
-
def self.get(path, options={})
|
104
|
-
self.get_with_caching(path, options={})
|
105
|
-
end
|
106
|
-
|
107
|
-
end
|
108
|
-
end
|
109
|
-
|
110
|
-
# === Cache container
|
111
|
-
#
|
112
|
-
# Pass a store name ('memory', etc) to new
|
113
|
-
#
|
114
|
-
class Cache
|
115
|
-
attr_accessor :store
|
116
|
-
|
117
|
-
def initialize(store, options={})
|
118
|
-
self.class.logger = options[:logger]
|
119
|
-
@store = self.class.lookup_store(store).new(options)
|
120
|
-
end
|
121
|
-
|
122
|
-
def get(key); @store.get encode(key) unless stale?(key); end
|
123
|
-
def set(key, value); @store.set encode(key), value; end
|
124
|
-
def exists?(key); @store.exists? encode(key); end
|
125
|
-
def stale?(key); @store.stale? encode(key); end
|
126
|
-
|
127
|
-
def self.logger; @logger || default_logger; end
|
128
|
-
def self.default_logger; logger = ::Logger.new(STDERR); end
|
129
|
-
|
130
|
-
# Pass a filename (String), IO object, Logger instance or +nil+ to silence the logger
|
131
|
-
def self.logger=(device); @logger = device.kind_of?(::Logger) ? device : ::Logger.new(device); end
|
132
|
-
|
133
|
-
private
|
134
|
-
|
135
|
-
# Return store class based on passed name
|
136
|
-
def self.lookup_store(name)
|
137
|
-
store_name = "#{name.capitalize}Store"
|
138
|
-
return Store::const_get(store_name)
|
139
|
-
rescue NameError => e
|
140
|
-
raise Store::StoreNotFound, "The cache store '#{store_name}' was not found. Did you loaded any such class?"
|
141
|
-
end
|
142
|
-
|
143
|
-
def encode(key); Digest::MD5.hexdigest(key); end
|
144
|
-
end
|
145
|
-
|
146
|
-
|
147
|
-
# === Cache stores
|
148
|
-
#
|
149
|
-
module Store
|
150
|
-
|
151
|
-
class StoreNotFound < StandardError; end #:nodoc:
|
152
|
-
|
153
|
-
# ==== Abstract Store
|
154
|
-
# Inherit your store from this class
|
155
|
-
# *IMPORTANT*: Do not forget to call +super+ in your +initialize+ method!
|
156
|
-
#
|
157
|
-
class AbstractStore
|
158
|
-
def initialize(options={})
|
159
|
-
raise ArgumentError, "You need to set the :timeout parameter" unless options[:timeout]
|
160
|
-
@timeout = options[:timeout]
|
161
|
-
message = "Cache: Using #{self.class.to_s.split('::').last}"
|
162
|
-
message << " in location: #{options[:location]}" if options[:location]
|
163
|
-
message << " with timeout #{options[:timeout]} sec"
|
164
|
-
Cache.logger.info message unless options[:logger].nil?
|
165
|
-
return self
|
166
|
-
end
|
167
|
-
%w{set get exists? stale?}.each do |method_name|
|
168
|
-
define_method(method_name) { raise NoMethodError, "Please implement method #{method_name} in your store class" }
|
169
|
-
end
|
170
|
-
end
|
171
|
-
|
172
|
-
# ==== Store objects in memory
|
173
|
-
# See HTTParty::Icebox::ClassMethods.cache
|
174
|
-
#
|
175
|
-
class MemoryStore < AbstractStore
|
176
|
-
def initialize(options={})
|
177
|
-
super; @store = {}; self
|
178
|
-
end
|
179
|
-
def set(key, value)
|
180
|
-
Cache.logger.info("Cache: set (#{key})")
|
181
|
-
@store[key] = [Time.now, value]; true
|
182
|
-
end
|
183
|
-
def get(key)
|
184
|
-
data = @store[key][1]
|
185
|
-
Cache.logger.info("Cache: #{data.nil? ? "miss" : "hit"} (#{key})")
|
186
|
-
data
|
187
|
-
end
|
188
|
-
def exists?(key)
|
189
|
-
!@store[key].nil?
|
190
|
-
end
|
191
|
-
def stale?(key)
|
192
|
-
return true unless exists?(key)
|
193
|
-
Time.now - created(key) > @timeout
|
194
|
-
end
|
195
|
-
private
|
196
|
-
def created(key)
|
197
|
-
@store[key][0]
|
198
|
-
end
|
199
|
-
end
|
200
|
-
|
201
|
-
# ==== Store objects on the filesystem
|
202
|
-
# See HTTParty::Icebox::ClassMethods.cache
|
203
|
-
#
|
204
|
-
class FileStore < AbstractStore
|
205
|
-
def initialize(options={})
|
206
|
-
super
|
207
|
-
options[:location] ||= Dir::tmpdir
|
208
|
-
@path = Pathname.new( options[:location] )
|
209
|
-
FileUtils.mkdir_p( @path )
|
210
|
-
self
|
211
|
-
end
|
212
|
-
def set(key, value)
|
213
|
-
Cache.logger.info("Cache: set (#{key})")
|
214
|
-
File.open( @path.join(key), 'w' ) { |file| file << Marshal.dump(value) }
|
215
|
-
true
|
216
|
-
end
|
217
|
-
def get(key)
|
218
|
-
data = Marshal.load(File.read( @path.join(key)))
|
219
|
-
Cache.logger.info("Cache: #{data.nil? ? "miss" : "hit"} (#{key})")
|
220
|
-
data
|
221
|
-
end
|
222
|
-
def exists?(key)
|
223
|
-
File.exists?( @path.join(key) )
|
224
|
-
end
|
225
|
-
def stale?(key)
|
226
|
-
return true unless exists?(key)
|
227
|
-
Time.now - created(key) > @timeout
|
228
|
-
end
|
229
|
-
private
|
230
|
-
def created(key)
|
231
|
-
File.mtime( @path.join(key) )
|
232
|
-
end
|
233
|
-
end
|
234
|
-
end
|
235
|
-
|
236
|
-
end
|
237
|
-
end
|
238
|
-
|
239
|
-
|
240
|
-
# Major parts of this code are based on architecture of ApiCache.
|
241
|
-
# Copyright (c) 2008 Martyn Loughran
|
242
|
-
#
|
243
|
-
# Other parts are inspired by the ActiveSupport::Cache in Ruby On Rails.
|
244
|
-
# Copyright (c) 2005-2009 David Heinemeier Hansson
|
245
|
-
#
|
246
|
-
# Permission is hereby granted, free of charge, to any person obtaining
|
247
|
-
# a copy of this software and associated documentation files (the
|
248
|
-
# "Software"), to deal in the Software without restriction, including
|
249
|
-
# without limitation the rights to use, copy, modify, merge, publish,
|
250
|
-
# distribute, sublicense, and/or sell copies of the Software, and to
|
251
|
-
# permit persons to whom the Software is furnished to do so, subject to
|
252
|
-
# the following conditions:
|
253
|
-
#
|
254
|
-
# The above copyright notice and this permission notice shall be
|
255
|
-
# included in all copies or substantial portions of the Software.
|
256
|
-
#
|
257
|
-
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
258
|
-
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
259
|
-
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
260
|
-
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
261
|
-
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
262
|
-
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
263
|
-
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/test/helper.rb
DELETED