semantic_extraction 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Chris Vannoy
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,37 @@
1
+ = semantic_extraction
2
+
3
+ Extract meaningful information from unstructured text with Ruby.
4
+
5
+ Using a variety of APIs (Yahoo Term Extractor and Alchemy are currently supported), semantic_extraction can automatically return a collection of keywords for an arbitrary block of text. If you use Alchemy, it can also return named entities.
6
+
7
+ == The APIs in use
8
+
9
+ * {Yahoo Term Extractor}[http://developer.yahoo.com/search/content/V1/termExtraction.html]
10
+
11
+ * {Alchemy API}[http://www.alchemyapi.com/api/]
12
+
13
+ == Upcoming To-Dos
14
+
15
+ * Add support for {OpenCalais}[http://www.opencalais.com/documentation/opencalais-documentation]
16
+
17
+ * Flesh out the rest of the Alchemy API
18
+
19
+ * Make it possible to dynamically pick which API to use (so it's possible to use multiple APIs in the same app)
20
+
21
+ * Make it less fugly.
22
+
23
+ * Tests, tests and more tests.
24
+
25
+ == Note on Patches/Pull Requests
26
+
27
+ * Fork the project.
28
+ * Make your feature addition or bug fix.
29
+ * Add tests for it. This is important so I don't break it in a
30
+ future version unintentionally.
31
+ * Commit, do not mess with rakefile, version, or history.
32
+ (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
33
+ * Send me a pull request. Bonus points for topic branches.
34
+
35
+ == Copyright
36
+
37
+ Copyright (c) 2010 Chris Vannoy. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,55 @@
1
require 'rubygems'
require 'rake'

# Gem packaging/release tasks. Jeweler is optional: without it the test and
# documentation tasks below still work.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "semantic_extraction"
    gem.summary = %Q{Extract meaningful information from unstructured text with Ruby}
    gem.description = %Q{Using a variety of APIs (Yahoo term Extractor and Alchemy are currently supported), semantic_extraction can automatically return a collection of keywords for an arbitrary block of text. If using Alchemy, it can also return named entities.}
    gem.email = "chris@chrisvannoy.com"
    gem.homepage = "http://github.com/dummied/semantic_extraction"
    gem.authors = ["Chris Vannoy"]
    gem.add_development_dependency "thoughtbot-shoulda", ">= 0"
    gem.add_dependency "ruby_tubesday"
    gem.add_dependency "nokogiri"
    gem.add_dependency "extlib"
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Unit tests: every test/**/test_*.rb file, with lib and test on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage task. When rcov is missing, define a stub that explains how to get it.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is defined by Jeweler. Only hook it in when it exists,
# otherwise `rake test` would abort on an unknown prerequisite.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

# API documentation. Prefer RDoc's own task; fall back to the task class that
# older Rake versions shipped as rake/rdoctask.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # Strip the trailing newline so it does not end up in the page title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "semantic_extraction #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,41 @@
1
module SemanticExtraction
  # Extractor that talks to the Alchemy API.
  class Alchemy
    STARTER = "http://access.alchemyapi.com/calls/"

    # Returns an array with the text of every <keyword> element found under
    # <keywords> in the API's XML response for +text+ (a URL or raw text).
    def self.find_keywords(text)
      document = Nokogiri::XML(request("GetKeywords", text))
      document.search("keywords keyword").map { |node| node.text }
    end

    # Returns an array of OpenStructs, one per <entity> element found under
    # <entities> in the API's XML response for +text+ (a URL or raw text).
    # The entity's "type" key, when present, is exposed as "entity_type".
    def self.find_entities(text)
      document = Nokogiri::XML(request("GetRankedNamedEntities", text))
      document.search("entities entity").map do |node|
        attributes = Hash.from_xml(node.to_s)["entity"]
        kind = attributes.delete("type")
        attributes["entity_type"] = kind if kind
        OpenStruct.new(attributes)
      end
    end

    # Posts +text+ to the URL- or Text-flavoured variant of +call+, depending
    # on whether SemanticExtraction.is_url? considers +text+ a URL, and
    # returns whatever SemanticExtraction.post returns.
    def self.request(call, text)
      mode = SemanticExtraction.is_url?(text) ? "url" : "text"
      flavour = mode == "url" ? "URL" : "Text"
      target = STARTER + mode + "/" + flavour + call
      SemanticExtraction.post(target, text, mode, SemanticExtraction::ALCHEMY_API_KEY)
    end
    private_class_method :request

  end
end
@@ -0,0 +1,19 @@
1
module SemanticExtraction
  # Extractor that talks to Yahoo's term extraction service.
  class Yahoo
    STARTER = "http://api.search.yahoo.com/ContentAnalysisService/V1/termExtraction"

    # Posts +text+ as the "context" parameter (API key sent as :appid) and
    # returns an array with the text of every <Result> element in the XML
    # response.
    def self.find_keywords(text)
      response = SemanticExtraction.post(STARTER, text, 'context', SemanticExtraction::YAHOO_API_KEY, :appid)
      Nokogiri::XML(response).search("Result").map { |result| result.text }
    end

  end
end
@@ -0,0 +1,73 @@
1
+ require 'ruby_tubesday'
2
+ require 'nokogiri'
3
+ require 'extlib'
4
+ require 'ostruct'
5
+
6
module SemanticExtraction

  # Automatically require every extractor in semantic_extraction/extractors.
  # The directory is resolved relative to this file, so loading works whatever
  # the current working directory is, and only *.rb files are picked up (no
  # assumption that "." and ".." are the first two directory entries).
  extractors_glob = File.join(File.dirname(__FILE__), "semantic_extraction", "extractors", "*.rb")
  Dir[extractors_glob].sort.each do |extractor_file|
    require File.expand_path(extractor_file)
  end

  # Raised when you're lacking an api key for the particular api you're using.
  class MissingApiKey < StandardError; end

  # Raised when the api you're using doesn't support the method you're attempting.
  # This will become more important when we start mapping out all of the other features in the Alchemy API
  class NotSupportedExtraction < StandardError; end

  # By default, we assume you want to use Alchemy.
  # To override, just set SemanticExtraction::PREFERRED_EXTRACTOR somewhere.
  def self.preferred_extractor
    defined?(PREFERRED_EXTRACTOR) ? PREFERRED_EXTRACTOR : "alchemy"
  end

  HTTP = RubyTubesday.new

  # Will return an array of keywords gleaned from the text you pass in.
  # Both Yahoo and Alchemy will handle a block of text, but Alchemy can also handle a plain URL.
  #
  # Raises NotSupportedExtraction when the preferred extractor has no
  # find_keywords, and MissingApiKey when its <NAME>_API_KEY constant is unset.
  def self.find_keywords(text)
    dispatch(:find_keywords, text)
  end

  # Will return an array of OpenStruct representing the named entities from the text.
  # At the moment, Alchemy is the only one to support this.
  # Down the road, we'll add in OpenCalais and others.
  #
  # Raises NotSupportedExtraction when the preferred extractor has no
  # find_entities, and MissingApiKey when its <NAME>_API_KEY constant is unset.
  def self.find_entities(text)
    dispatch(:find_entities, text)
  end

  # Posts +target+ to +url+ under the +calling_param+ key, together with
  # +api_key+ under the +api_param+ key (:apikey unless overridden).
  # Returns whatever HTTP.post returns.
  def self.post(url, target, calling_param, api_key, api_param = :apikey)
    HTTP.post(url, :params => { calling_param => target, api_param => api_key })
  end

  # Checks to see if a string is a URL.
  # Only strings that begin with an http:// or https:// scheme count; ordinary
  # text that merely starts with the letters "http" is treated as text.
  # Always returns true or false (nil input yields false).
  def self.is_url?(link)
    !!(link.to_s =~ %r{\Ahttps?://}i)
  end

  # Shared implementation behind find_keywords / find_entities: looks up the
  # preferred extractor class, verifies it supports +operation+ and that the
  # matching <NAME>_API_KEY constant is defined on this module, then delegates.
  def self.dispatch(operation, text)
    extractor = preferred_extractor.to_s
    klass = const_get(extractor.capitalize)

    unless klass.respond_to?(operation)
      raise NotSupportedExtraction, "#{extractor} does not support #{operation}"
    end

    # const_defined? actually checks for the constant; the previous
    # defined?(string expression) test was always truthy. The second argument
    # restricts the lookup to this module, which is where the extractors read
    # the key from.
    key_name = "#{extractor.upcase}_API_KEY"
    unless const_defined?(key_name, false)
      raise MissingApiKey, "SemanticExtraction::#{key_name} is not set"
    end

    klass.send(operation, text)
  end
  private_class_method :dispatch

end
@@ -0,0 +1,65 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{semantic_extraction}
8
+ s.version = "0.1.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Chris Vannoy"]
12
+ s.date = %q{2010-03-10}
13
+ s.description = %q{Using a variety of APIs (Yahoo term Extractor and Alchemy are currently supported), semantic_extraction can automatically return a collection of keywords for an arbitrary block of text. If using Alchemy, it can also return named entities.}
14
+ s.email = %q{chris@chrisvannoy.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ "LICENSE",
23
+ "README.rdoc",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "lib/semantic_extraction.rb",
27
+ "lib/semantic_extraction/extractors/alchemy.rb",
28
+ "lib/semantic_extraction/extractors/yahoo.rb",
29
+ "semantic_extraction.gemspec",
30
+ "test/helper.rb",
31
+ "test/test_semantic_extraction.rb"
32
+ ]
33
+ s.homepage = %q{http://github.com/dummied/semantic_extraction}
34
+ s.rdoc_options = ["--charset=UTF-8"]
35
+ s.require_paths = ["lib"]
36
+ s.rubygems_version = %q{1.3.6}
37
+ s.summary = %q{Extract meaningful information from unstructured text with Ruby}
38
+ s.test_files = [
39
+ "test/helper.rb",
40
+ "test/test_semantic_extraction.rb"
41
+ ]
42
+
43
+ if s.respond_to? :specification_version then
44
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
45
+ s.specification_version = 3
46
+
47
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
48
+ s.add_development_dependency(%q<thoughtbot-shoulda>, [">= 0"])
49
+ s.add_runtime_dependency(%q<ruby_tubesday>, [">= 0"])
50
+ s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
51
+ s.add_runtime_dependency(%q<extlib>, [">= 0"])
52
+ else
53
+ s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
54
+ s.add_dependency(%q<ruby_tubesday>, [">= 0"])
55
+ s.add_dependency(%q<nokogiri>, [">= 0"])
56
+ s.add_dependency(%q<extlib>, [">= 0"])
57
+ end
58
+ else
59
+ s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
60
+ s.add_dependency(%q<ruby_tubesday>, [">= 0"])
61
+ s.add_dependency(%q<nokogiri>, [">= 0"])
62
+ s.add_dependency(%q<extlib>, [">= 0"])
63
+ end
64
+ end
65
+
data/test/helper.rb ADDED
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+
5
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
6
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
7
+ require 'semantic_extraction'
8
+
9
+ class Test::Unit::TestCase
10
+ end
@@ -0,0 +1,7 @@
1
+ require 'helper'
2
+
3
# Basic, network-free checks of the SemanticExtraction module helpers.
class TestSemanticExtraction < Test::Unit::TestCase
  context "SemanticExtraction.is_url?" do
    should "recognise an http URL" do
      assert SemanticExtraction.is_url?("http://example.com/article")
    end

    should "recognise an https URL" do
      assert SemanticExtraction.is_url?("https://example.com/article")
    end

    should "treat ordinary prose as text" do
      assert !SemanticExtraction.is_url?("Ruby is a dynamic, open source programming language.")
    end
  end

  context "SemanticExtraction.preferred_extractor" do
    should "default to alchemy when no override constant is set" do
      assert_equal "alchemy", SemanticExtraction.preferred_extractor
    end
  end

  context "SemanticExtraction error classes" do
    should "be rescuable as StandardError" do
      assert SemanticExtraction::MissingApiKey.ancestors.include?(StandardError)
      assert SemanticExtraction::NotSupportedExtraction.ancestors.include?(StandardError)
    end
  end
end
metadata ADDED
@@ -0,0 +1,122 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: semantic_extraction
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 0
9
+ version: 0.1.0
10
+ platform: ruby
11
+ authors:
12
+ - Chris Vannoy
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-03-10 00:00:00 -05:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: thoughtbot-shoulda
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
29
+ version: "0"
30
+ type: :development
31
+ version_requirements: *id001
32
+ - !ruby/object:Gem::Dependency
33
+ name: ruby_tubesday
34
+ prerelease: false
35
+ requirement: &id002 !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ segments:
40
+ - 0
41
+ version: "0"
42
+ type: :runtime
43
+ version_requirements: *id002
44
+ - !ruby/object:Gem::Dependency
45
+ name: nokogiri
46
+ prerelease: false
47
+ requirement: &id003 !ruby/object:Gem::Requirement
48
+ requirements:
49
+ - - ">="
50
+ - !ruby/object:Gem::Version
51
+ segments:
52
+ - 0
53
+ version: "0"
54
+ type: :runtime
55
+ version_requirements: *id003
56
+ - !ruby/object:Gem::Dependency
57
+ name: extlib
58
+ prerelease: false
59
+ requirement: &id004 !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ segments:
64
+ - 0
65
+ version: "0"
66
+ type: :runtime
67
+ version_requirements: *id004
68
+ description: Using a variety of APIs (Yahoo term Extractor and Alchemy are currently supported), semantic_extraction can automatically return a collection of keywords for an arbitrary block of text. If using Alchemy, it can also return named entities.
69
+ email: chris@chrisvannoy.com
70
+ executables: []
71
+
72
+ extensions: []
73
+
74
+ extra_rdoc_files:
75
+ - LICENSE
76
+ - README.rdoc
77
+ files:
78
+ - .document
79
+ - .gitignore
80
+ - LICENSE
81
+ - README.rdoc
82
+ - Rakefile
83
+ - VERSION
84
+ - lib/semantic_extraction.rb
85
+ - lib/semantic_extraction/extractors/alchemy.rb
86
+ - lib/semantic_extraction/extractors/yahoo.rb
87
+ - semantic_extraction.gemspec
88
+ - test/helper.rb
89
+ - test/test_semantic_extraction.rb
90
+ has_rdoc: true
91
+ homepage: http://github.com/dummied/semantic_extraction
92
+ licenses: []
93
+
94
+ post_install_message:
95
+ rdoc_options:
96
+ - --charset=UTF-8
97
+ require_paths:
98
+ - lib
99
+ required_ruby_version: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ segments:
104
+ - 0
105
+ version: "0"
106
+ required_rubygems_version: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ segments:
111
+ - 0
112
+ version: "0"
113
+ requirements: []
114
+
115
+ rubyforge_project:
116
+ rubygems_version: 1.3.6
117
+ signing_key:
118
+ specification_version: 3
119
+ summary: Extract meaningful information from unstructured text with Ruby
120
+ test_files:
121
+ - test/helper.rb
122
+ - test/test_semantic_extraction.rb