semantic_extraction 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Chris Vannoy
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,37 @@
1
+ = semantic_extraction
2
+
3
+ Extract meaningful information from unstructured text with Ruby.
4
+
5
+ Using a variety of APIs (Yahoo Term Extractor and Alchemy are currently supported), semantic_extraction can automatically return a collection of keywords for an arbitrary block of text. If you use Alchemy, it can also return named entities.
6
+
7
+ == The APIs in use
8
+
9
+ * {Yahoo Term Extractor}[http://developer.yahoo.com/search/content/V1/termExtraction.html]
10
+
11
+ * {Alchemy API}[http://www.alchemyapi.com/api/]
12
+
13
+ == Upcoming To-Dos
14
+
15
+ * Add support for {OpenCalais}[http://www.opencalais.com/documentation/opencalais-documentation]
16
+
17
+ * Flesh out the rest of the Alchemy API
18
+
19
+ * Make it possible to dynamically pick which API to use (so it's possible to use multiple APIs in the same app)
20
+
21
+ * Make it less fugly.
22
+
23
+ * Tests, tests and more tests.
24
+
25
+ == Note on Patches/Pull Requests
26
+
27
+ * Fork the project.
28
+ * Make your feature addition or bug fix.
29
+ * Add tests for it. This is important so I don't break it in a
30
+ future version unintentionally.
31
+ * Commit, do not mess with rakefile, version, or history.
32
+ (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
33
+ * Send me a pull request. Bonus points for topic branches.
34
+
35
+ == Copyright
36
+
37
+ Copyright (c) 2010 Chris Vannoy. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,55 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "semantic_extraction"
8
+ gem.summary = %Q{Extract meaningful information from unstructured text with Ruby}
9
+ gem.description = %Q{Using a variety of APIs (Yahoo term Extractor and Alchemy are currently supported), semantic_extraction can automatically return a collection of keywords for an arbitrary block of text. If using Alchemy, it can also return named entities.}
10
+ gem.email = "chris@chrisvannoy.com"
11
+ gem.homepage = "http://github.com/dummied/semantic_extraction"
12
+ gem.authors = ["Chris Vannoy"]
13
+ gem.add_development_dependency "thoughtbot-shoulda", ">= 0"
14
+ gem.add_dependency "ruby_tubesday"
15
+ gem.add_dependency "nokogiri"
16
+ gem.add_dependency "extlib"
17
+ end
18
+ Jeweler::GemcutterTasks.new
19
+ rescue LoadError
20
+ puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
21
+ end
22
+
23
+ require 'rake/testtask'
24
+ Rake::TestTask.new(:test) do |test|
25
+ test.libs << 'lib' << 'test'
26
+ test.pattern = 'test/**/test_*.rb'
27
+ test.verbose = true
28
+ end
29
+
30
+ begin
31
+ require 'rcov/rcovtask'
32
+ Rcov::RcovTask.new do |test|
33
+ test.libs << 'test'
34
+ test.pattern = 'test/**/test_*.rb'
35
+ test.verbose = true
36
+ end
37
+ rescue LoadError
38
+ task :rcov do
39
+ abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
40
+ end
41
+ end
42
+
43
+ task :test => :check_dependencies
44
+
45
+ task :default => :test
46
+
47
+ require 'rake/rdoctask'
48
+ Rake::RDocTask.new do |rdoc|
49
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
50
+
51
+ rdoc.rdoc_dir = 'rdoc'
52
+ rdoc.title = "semantic_extraction #{version}"
53
+ rdoc.rdoc_files.include('README*')
54
+ rdoc.rdoc_files.include('lib/**/*.rb')
55
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,41 @@
1
module SemanticExtraction
  # Extractor backed by the Alchemy API.
  #
  # The input may be either a block of text or a URL. Whatever
  # SemanticExtraction.is_url? reports decides which endpoint family
  # ("url/URL..." or "text/Text...") is called and which request parameter
  # carries the input.
  class Alchemy
    STARTER = "http://access.alchemyapi.com/calls/"

    # Returns an Array with the text of every <keyword> element found under
    # <keywords> in the API response. The Array is empty when the response
    # contains no keywords.
    def self.find_keywords(text)
      doc = call_api(text, "GetKeywords")
      (doc / "keywords keyword").map { |node| node.text }
    end

    # Returns an Array of OpenStruct objects, one per <entity> element found
    # under <entities> in the API response.
    #
    # Each entity is converted with Hash.from_xml and its "type" key is
    # renamed to "entity_type" before the OpenStruct is built.
    # NOTE(review): presumably the rename avoids a clash with Object#type on
    # older Rubies -- confirm before relying on it.
    def self.find_entities(text)
      doc = call_api(text, "GetRankedNamedEntities")
      (doc / "entities entity").map do |node|
        attributes = Hash.from_xml(node.to_s)["entity"]
        kind = attributes.delete("type")
        attributes["entity_type"] = kind if kind
        OpenStruct.new(attributes)
      end
    end

    # Posts +text+ to the Alchemy endpoint for +operation+ (for example
    # "GetKeywords") and returns the response parsed by Nokogiri::XML.
    def self.call_api(text, operation)
      prefix = SemanticExtraction.is_url?(text) ? "url" : "text"
      endpoint = (prefix == "url" ? "URL" : "Text") + operation
      url = STARTER + prefix + "/" + endpoint
      raw = SemanticExtraction.post(url, text, prefix, SemanticExtraction::ALCHEMY_API_KEY)
      Nokogiri::XML(raw)
    end
    private_class_method :call_api

  end
end
@@ -0,0 +1,19 @@
1
module SemanticExtraction
  # Extractor backed by the Yahoo Term Extraction service.
  class Yahoo
    STARTER = "http://api.search.yahoo.com/ContentAnalysisService/V1/termExtraction"

    # Posts +text+ to the service as the "context" parameter, passing
    # SemanticExtraction::YAHOO_API_KEY as :appid.
    #
    # Returns an Array with the text of every <Result> element in the
    # response. The Array is empty when the response contains none.
    def self.find_keywords(text)
      raw = SemanticExtraction.post(STARTER, text, 'context', SemanticExtraction::YAHOO_API_KEY, :appid)
      doc = Nokogiri::XML(raw)
      (doc / "Result").map { |node| node.text }
    end

  end
end
@@ -0,0 +1,73 @@
1
+ require 'ruby_tubesday'
2
+ require 'nokogiri'
3
+ require 'extlib'
4
+ require 'ostruct'
5
+
6
module SemanticExtraction

  # Automatically require every extractor in semantic_extraction/extractors.
  # The directory is resolved relative to this file, not the process's
  # working directory, so loading works no matter where the gem is required
  # from. Globbing for *.rb also avoids assuming that "." and ".." are the
  # first two directory entries.
  Dir[File.join(File.dirname(__FILE__), "semantic_extraction", "extractors", "*.rb")].sort.each do |extractor_file|
    require File.expand_path(extractor_file)
  end

  # Raised when no API key constant is defined for the extractor in use.
  class MissingApiKey < StandardError; end

  # Raised when the extractor in use doesn't support the requested method.
  class NotSupportedExtraction < StandardError; end

  # Name of the extractor to use. Defaults to "alchemy"; to override, define
  # SemanticExtraction::PREFERRED_EXTRACTOR.
  def self.preferred_extractor
    defined?(PREFERRED_EXTRACTOR) ? PREFERRED_EXTRACTOR : "alchemy"
  end

  HTTP = RubyTubesday.new

  # Returns whatever the preferred extractor's find_keywords returns for
  # +text+ (the bundled extractors return an Array of keyword strings).
  #
  # Raises NotSupportedExtraction if the extractor has no find_keywords.
  # Raises MissingApiKey if SemanticExtraction::<EXTRACTOR>_API_KEY is not
  # defined.
  def self.find_keywords(text)
    dispatch(:find_keywords, text)
  end

  # Returns whatever the preferred extractor's find_entities returns for
  # +text+ (the bundled Alchemy extractor returns an Array of OpenStruct).
  #
  # Raises NotSupportedExtraction if the extractor has no find_entities.
  # Raises MissingApiKey if SemanticExtraction::<EXTRACTOR>_API_KEY is not
  # defined.
  def self.find_entities(text)
    dispatch(:find_entities, text)
  end

  # Posts to +url+ with two parameters: +calling_param+ => +target+ and
  # +api_param+ => +api_key+. Returns the result of HTTP.post.
  def self.post(url, target, calling_param, api_key, api_param = :apikey)
    HTTP.post(url, :params => { calling_param => target, api_param => api_key })
  end

  # Returns true when +link+ starts with "http://" or "https://" (case
  # insensitive), false otherwise. nil is treated as an empty string.
  # Plain text that merely begins with the letters "http" is not a URL.
  def self.is_url?(link)
    !(link.to_s =~ %r{\Ahttps?://}i).nil?
  end

  # Looks up the preferred extractor class and forwards +operation+ to it.
  # A missing method takes precedence over a missing API key, matching the
  # order of the checks callers have always seen.
  def self.dispatch(operation, text)
    extractor = preferred_extractor.to_s
    klass = const_get(extractor.capitalize)
    raise NotSupportedExtraction unless klass.respond_to?(operation)

    # The extractors read the key as SemanticExtraction::<NAME>_API_KEY, so
    # check this module's own constant list. Comparing as strings keeps the
    # check working whether Module#constants yields Strings or Symbols.
    key_name = extractor.upcase + "_API_KEY"
    raise MissingApiKey unless constants.map { |c| c.to_s }.include?(key_name)

    klass.send(operation, text)
  end
  private_class_method :dispatch

end
@@ -0,0 +1,65 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{semantic_extraction}
8
+ s.version = "0.1.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Chris Vannoy"]
12
+ s.date = %q{2010-03-10}
13
+ s.description = %q{Using a variety of APIs (Yahoo term Extractor and Alchemy are currently supported), semantic_extraction can automatically return a collection of keywords for an arbitrary block of text. If using Alchemy, it can also return named entities.}
14
+ s.email = %q{chris@chrisvannoy.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ "LICENSE",
23
+ "README.rdoc",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "lib/semantic_extraction.rb",
27
+ "lib/semantic_extraction/extractors/alchemy.rb",
28
+ "lib/semantic_extraction/extractors/yahoo.rb",
29
+ "semantic_extraction.gemspec",
30
+ "test/helper.rb",
31
+ "test/test_semantic_extraction.rb"
32
+ ]
33
+ s.homepage = %q{http://github.com/dummied/semantic_extraction}
34
+ s.rdoc_options = ["--charset=UTF-8"]
35
+ s.require_paths = ["lib"]
36
+ s.rubygems_version = %q{1.3.6}
37
+ s.summary = %q{Extract meaningful information from unstructured text with Ruby}
38
+ s.test_files = [
39
+ "test/helper.rb",
40
+ "test/test_semantic_extraction.rb"
41
+ ]
42
+
43
+ if s.respond_to? :specification_version then
44
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
45
+ s.specification_version = 3
46
+
47
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
48
+ s.add_development_dependency(%q<thoughtbot-shoulda>, [">= 0"])
49
+ s.add_runtime_dependency(%q<ruby_tubesday>, [">= 0"])
50
+ s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
51
+ s.add_runtime_dependency(%q<extlib>, [">= 0"])
52
+ else
53
+ s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
54
+ s.add_dependency(%q<ruby_tubesday>, [">= 0"])
55
+ s.add_dependency(%q<nokogiri>, [">= 0"])
56
+ s.add_dependency(%q<extlib>, [">= 0"])
57
+ end
58
+ else
59
+ s.add_dependency(%q<thoughtbot-shoulda>, [">= 0"])
60
+ s.add_dependency(%q<ruby_tubesday>, [">= 0"])
61
+ s.add_dependency(%q<nokogiri>, [">= 0"])
62
+ s.add_dependency(%q<extlib>, [">= 0"])
63
+ end
64
+ end
65
+
data/test/helper.rb ADDED
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+
5
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
6
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
7
+ require 'semantic_extraction'
8
+
9
+ class Test::Unit::TestCase
10
+ end
@@ -0,0 +1,7 @@
1
+ require 'helper'
2
+
3
+ class TestSemanticExtraction < Test::Unit::TestCase
4
+ should "probably rename this file and start testing for real" do
5
+ flunk "hey buddy, you should probably rename this file and start testing for real"
6
+ end
7
+ end
metadata ADDED
@@ -0,0 +1,122 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: semantic_extraction
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 0
9
+ version: 0.1.0
10
+ platform: ruby
11
+ authors:
12
+ - Chris Vannoy
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-03-10 00:00:00 -05:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: thoughtbot-shoulda
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
29
+ version: "0"
30
+ type: :development
31
+ version_requirements: *id001
32
+ - !ruby/object:Gem::Dependency
33
+ name: ruby_tubesday
34
+ prerelease: false
35
+ requirement: &id002 !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ segments:
40
+ - 0
41
+ version: "0"
42
+ type: :runtime
43
+ version_requirements: *id002
44
+ - !ruby/object:Gem::Dependency
45
+ name: nokogiri
46
+ prerelease: false
47
+ requirement: &id003 !ruby/object:Gem::Requirement
48
+ requirements:
49
+ - - ">="
50
+ - !ruby/object:Gem::Version
51
+ segments:
52
+ - 0
53
+ version: "0"
54
+ type: :runtime
55
+ version_requirements: *id003
56
+ - !ruby/object:Gem::Dependency
57
+ name: extlib
58
+ prerelease: false
59
+ requirement: &id004 !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ segments:
64
+ - 0
65
+ version: "0"
66
+ type: :runtime
67
+ version_requirements: *id004
68
+ description: Using a variety of APIs (Yahoo term Extractor and Alchemy are currently supported), semantic_extraction can automatically return a collection of keywords for an arbitrary block of text. If using Alchemy, it can also return named entities.
69
+ email: chris@chrisvannoy.com
70
+ executables: []
71
+
72
+ extensions: []
73
+
74
+ extra_rdoc_files:
75
+ - LICENSE
76
+ - README.rdoc
77
+ files:
78
+ - .document
79
+ - .gitignore
80
+ - LICENSE
81
+ - README.rdoc
82
+ - Rakefile
83
+ - VERSION
84
+ - lib/semantic_extraction.rb
85
+ - lib/semantic_extraction/extractors/alchemy.rb
86
+ - lib/semantic_extraction/extractors/yahoo.rb
87
+ - semantic_extraction.gemspec
88
+ - test/helper.rb
89
+ - test/test_semantic_extraction.rb
90
+ has_rdoc: true
91
+ homepage: http://github.com/dummied/semantic_extraction
92
+ licenses: []
93
+
94
+ post_install_message:
95
+ rdoc_options:
96
+ - --charset=UTF-8
97
+ require_paths:
98
+ - lib
99
+ required_ruby_version: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ segments:
104
+ - 0
105
+ version: "0"
106
+ required_rubygems_version: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ segments:
111
+ - 0
112
+ version: "0"
113
+ requirements: []
114
+
115
+ rubyforge_project:
116
+ rubygems_version: 1.3.6
117
+ signing_key:
118
+ specification_version: 3
119
+ summary: Extract meaningful information from unstructured text with Ruby
120
+ test_files:
121
+ - test/helper.rb
122
+ - test/test_semantic_extraction.rb