RubyGems - term-extract - Versions diffs - 0.2.0 - Mend

term-extract 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

data/.document ADDED Viewed

@@ -0,0 +1,5 @@
+lib/**/*.rb
+bin/*
+-
+features/**/*.feature
+LICENSE.txt

data/Gemfile ADDED Viewed

@@ -0,0 +1,15 @@
+source "http://rubygems.org"
+# Add dependencies required to use your gem here.
+# Example:
+#   gem "activesupport", ">= 2.3.5"
+gem 'rbtagger', ">=0"
+# Add dependencies to develop your gem here.
+# Include everything needed to run rake, tests, features, etc.
+group :development do
+  gem "shoulda", ">= 0"
+  gem "bundler", "~> 1.0.0"
+  gem "jeweler", "~> 1.5.2"
+  gem "rcov", ">= 0"
+end

data/Gemfile.lock ADDED Viewed

@@ -0,0 +1,22 @@
+GEM
+  remote: http://rubygems.org/
+  specs:
+    git (1.2.5)
+    jeweler (1.5.2)
+      bundler (~> 1.0.0)
+      git (>= 1.2.5)
+      rake
+    rake (0.8.7)
+    rbtagger (0.4.6)
+    rcov (0.9.9)
+    shoulda (2.11.3)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  bundler (~> 1.0.0)
+  jeweler (~> 1.5.2)
+  rbtagger
+  rcov
+  shoulda

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,5 @@
+This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
+This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+You should have received a copy of the GNU General Public License along with this program. If not, see <www.gnu.org/licenses/>

data/README.markdown ADDED Viewed

@@ -0,0 +1,57 @@
+# term_extract - Term Extract
+## Description:
+term_extract extracts proper nouns (named things like 'Manchester United') and ordinary nouns (like 'event') from text documents.
+## Usage:
+An example extracting terms from a piece of content:
+    require 'term_extract'
+    content = <<DOC
+    Business Secretary Vince Cable will stay in cabinet despite
+    "declaring war" on Rupert Murdoch, says Downing Street.
+    DOC
+    terms = TermExtract.extract(content)
+## Options
+The #extract method takes an (optional) options hash, that allows the term extractor behaviour to be modified.  The following options are available:
+* min_occurance - The minimum number of times a single word term must occur to be included in the results, default 3
+* min_terms - Always include multiword terms that comprise at least min_terms words, default 2
+* types - Extract proper nouns (:nnp) or nouns (:nn) or both (:all), default :all
+* include_tags - Include the extracted POS tags in the results, default false
+Sample usage:
+    terms = TermExtract.extract(content, :types => :nnp, :include_tags => true)
+## Term Extraction Types
+By default, the term extractor attempts to extract both ordinary nouns and proper nouns. This behaviour can be configured using the #types option, specifying :all (for both), :nn (for ordinary nouns) or :nnp (for proper nouns).  These codes correspond to the relevant POS tags used during the term extraction process.  Sample usage is shown below:
+    terms = TermExtract.extract(content, :types => :nnp)
+## Note on Patches/Pull Requests
+* Fork the project.
+* Make your feature addition or bug fix.
+* Add tests for it. This is important so I don't break it in a future version unintentionally.
+* Commit, do not mess with Rakefile, version, or history as it's handled by Jeweler.
+* Send me a pull request. I may or may not accept it.
+## Acknowledgements
+The algorithm and extraction code is based on the original python code at:
+http://pypi.python.org/pypi/topia.termextract/
+## Copyright and License
+GPL v3 - See LICENSE.txt for details.
+Copyright (c) 2010, Rob Lee

data/Rakefile ADDED Viewed

@@ -0,0 +1,53 @@
+require 'rubygems'
+require 'bundler'
+begin
+  Bundler.setup(:default, :development)
+rescue Bundler::BundlerError => e
+  $stderr.puts e.message
+  $stderr.puts "Run `bundle install` to install missing gems"
+  exit e.status_code
+end
+require 'rake'
+require 'jeweler'
+Jeweler::Tasks.new do |gem|
+  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
+  gem.name = "term-extract"
+  gem.homepage = "http://github.com/rattle/term-extract"
+  gem.license = "GPLv3"
+  gem.summary = %Q{Provides term extraction functionality}
+  gem.email = "robl@rjlee.net"
+  gem.authors = ["rattle"]
+  # Include your dependencies below. Runtime dependencies are required when using your gem,
+  # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
+  #  gem.add_runtime_dependency 'jabber4r', '> 0.1'
+  #  gem.add_development_dependency 'rspec', '> 1.2.3'
+  gem.add_dependency('rbtagger', '>= 0.0.0')
+end
+Jeweler::RubygemsDotOrgTasks.new
+require 'rake/testtask'
+Rake::TestTask.new(:test) do |test|
+  test.libs << 'lib' << 'test'
+  test.pattern = 'test/**/test_*.rb'
+  test.verbose = true
+end
+require 'rcov/rcovtask'
+Rcov::RcovTask.new do |test|
+  test.libs << 'test'
+  test.pattern = 'test/**/test_*.rb'
+  test.verbose = true
+end
+task :default => :test
+require 'rake/rdoctask'
+Rake::RDocTask.new do |rdoc|
+  version = File.exist?('VERSION') ? File.read('VERSION') : ""
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "term-extract #{version}"
+  rdoc.rdoc_files.include('README*')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end

data/VERSION ADDED Viewed

@@ -0,0 +1 @@
+0.2.0

data/lib/term-extract.rb ADDED Viewed

@@ -0,0 +1,148 @@
+require 'rbtagger'
+# Based on :
+# http://pypi.python.org/pypi/topia.termextract/
+class TermExtract
+  @@SEARCH=0
+  @@NOUN=1
+  @@TAGGER = Brill::Tagger.new
+  attr_accessor :min_occurance, :min_terms, :types, :include_tags, :lazy
+  # Provide a class method for syntactic sugar
+  def self.extract(content, options = {})
+    te = new(options)
+    te.extract(content)
+  end
+  def initialize(options = {})
+    # The minimum number of times a single word term must occur to be included in the results
+    @min_occurance = options.key?(:min_occurance) ? options.delete(:min_occurance) : 3
+    # Always include multiword terms that comprise more than @min_terms words
+    @min_terms = options.key?(:min_terms) ? options.delete(:min_terms) : 2
+    # Extract proper nouns (:nnp) or nouns (:nn) or both (:all)
+    @types = options.key?(:types) ? options.delete(:types) : :all
+    # Include the extracted POS tags in the results
+    @include_tags = options.key?(:include_tags) ? options.delete(:include_tags) : false
+    #@lazy = options.key?(:lazy) ? options.delete(:lazy) : false
+  end
+  def extract(content)
+    tagger = @@TAGGER.nil? ? Brill::Tagger.new : @@TAGGER
+    # Tidy content punctuation
+    # Add a space after periods
+    content.gsub!(/([A-Za-z0-9])\./, '\1. ')
+    # Add in full stops to tag list to allow multiterms to work
+    tags = []
+    tagger.tag(content).each do |tag|
+      if tag[0] =~ /\.$/
+        tag[0].chop!
+        tags.push tag
+        tags.push ['.', '.']
+      else
+         tags.push tag
+      end
+    end
+    # Set pos tags that identify nouns
+    pos = "^NN"
+    case @types
+    when :nn
+      pos = "^(NN|NNS)$"
+    when :nnp
+      pos = "^(NNP|NNPS)$"
+    end
+    terms = Hash.new()
+    multiterm = []
+    last_tag = ''
+    state = @@SEARCH
+    # Iterate through term list and identify nouns
+    tags.each do |term,tag|
+      if state == @@SEARCH and tag =~ /#{pos}/
+        # In search mode, found a noun
+        state = @@NOUN
+        add_term(term, tag, multiterm, terms)
+      elsif state == @@SEARCH and tag == 'JJ' and term =~ /^[A-Z]/ #and @lazy
+        # Allow things like 'Good' at the start of sentences
+        state = @@NOUN
+        add_term(term, tag, multiterm, terms)
+      elsif state == @@NOUN and tag == 'POS'
+        # Allow nouns with apostrophes : St Paul's Cathedral
+        multiterm << [term,tag]
+      elsif state == @@NOUN and last_tag =~ /^(NNP|NNPS)$/ and tag == 'IN' and term =~ /(of|for|on|of\sthe|\&|d\'|du|de)/i
+        # Allow preposition : "Secretary of State"
+        # Doesn't support "Chair of the Parades Commission"
+        # Only use when in NNP mode
+        multiterm << [term,tag]
+      elsif state == @@NOUN and tag =~ /#{pos}/
+        # In noun mode, found a noun, add a multiterm noun
+        add_term(term, tag, multiterm, terms)
+      elsif state == @@NOUN and tag !=~ /#{pos}/
+        # In noun mode, found a non-noun, do we have a possible multiterm ?
+        state = @@SEARCH
+        add_multiterm(multiterm, terms) if multiterm.length > 1
+        multiterm = []
+      end
+      last_tag = tag
+    end
+    # Check the last term wasn't a possible multiterm
+    add_multiterm(multiterm, terms)  if last_tag =~ /#{pos}/
+    # Filter out terms that don't meet minimum requirements
+    # It's possible for a term with multiple words to be returned even if it doesn't
+    # meet the min_occurance requirements (as a multiterm noun is very likely to be
+    # correct)
+    terms.each_key do |term|
+      occur = terms[term][:occurances]
+      strength = term.split(/ /).length
+      terms.delete(term) unless ((strength == 1 and occur >= @min_occurance) or (strength >= @min_terms))
+    end
+    # Filter out tags unless required
+    unless @include_tags
+      terms.each_key { |term| terms[term] = terms[term][:occurances] }
+    end
+    terms
+  end
+  protected
+  def add_term(term, tag, multiterm, terms)
+    multiterm << ([term, tag])
+    increment_term(term, tag, terms)
+  end
+  def add_multiterm(multiterm, terms)
+    multiterm.each { |rec| terms[rec[0]][:occurances] -=1 if terms.key?(rec[0]) && terms[rec[0]][:occurances] > 0 }
+    word = ''
+    multiterm.each_with_index do |term, index|
+      if (multiterm[index] == multiterm.last && term[1] == 'POS')
+        # Don't add a final 's if it's the last term
+      else
+        # Don't require a space for POS type concats
+        word+= term[1] == 'POS' ? term[0] : " #{term[0]}"
+      end
+    end
+    word.lstrip!
+    increment_term(word, 'NNP', terms)
+  end
+  def increment_term(term, tag, terms)
+    if terms.key?(term)
+      terms[term][:occurances] += 1
+    else
+      terms[term] = {}
+      terms[term][:occurances] = 1
+    end
+    terms[term][:tag] = tag
+  end
+end

data/term-extract.gemspec ADDED Viewed

@@ -0,0 +1,69 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE DIRECTLY
+# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
+# -*- encoding: utf-8 -*-
+Gem::Specification.new do |s| # Gem metadata for term-extract; the header of this file says to regenerate it with 'rake gemspec' instead of editing it
+  s.name = %q{term-extract}
+  s.version = "0.2.0"
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.authors = ["rattle"]
+  s.date = %q{2010-12-23}
+  s.email = %q{robl@rjlee.net}
+  s.extra_rdoc_files = [
+    "LICENSE.txt",
+    "README.markdown"
+  ]
+  s.files = [
+    ".document",
+    "Gemfile",
+    "Gemfile.lock",
+    "LICENSE.txt",
+    "README.markdown",
+    "Rakefile",
+    "VERSION",
+    "lib/term-extract.rb",
+    "term-extract.gemspec",
+    "test/helper.rb",
+    "test/test_term-extract.rb"
+  ]
+  s.homepage = %q{http://github.com/rattle/term-extract}
+  s.licenses = ["GPLv3"]
+  s.require_paths = ["lib"]
+  s.rubygems_version = %q{1.3.7}
+  s.summary = %q{Provides term extraction functionality}
+  s.test_files = [
+    "test/helper.rb",
+    "test/test_term-extract.rb"
+  ]
+  if s.respond_to? :specification_version then
+    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION # assigned but not read anywhere below
+    s.specification_version = 3
+    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then # RubyGems 1.2.0 or newer: runtime and development dependencies are registered separately
+      s.add_runtime_dependency(%q<rbtagger>, [">= 0"])
+      s.add_development_dependency(%q<shoulda>, [">= 0"])
+      s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
+      s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
+      s.add_development_dependency(%q<rcov>, [">= 0"])
+      s.add_runtime_dependency(%q<rbtagger>, [">= 0.0.0"]) # NOTE(review): second rbtagger entry, presumably from the add_dependency call in the Rakefile - confirm and drop at the source
+    else # older RubyGems: every dependency goes through add_dependency
+      s.add_dependency(%q<rbtagger>, [">= 0"])
+      s.add_dependency(%q<shoulda>, [">= 0"])
+      s.add_dependency(%q<bundler>, ["~> 1.0.0"])
+      s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
+      s.add_dependency(%q<rcov>, [">= 0"])
+      s.add_dependency(%q<rbtagger>, [">= 0.0.0"])
+    end
+  else # s has no specification_version: same flat dependency list
+    s.add_dependency(%q<rbtagger>, [">= 0"])
+    s.add_dependency(%q<shoulda>, [">= 0"])
+    s.add_dependency(%q<bundler>, ["~> 1.0.0"])
+    s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
+    s.add_dependency(%q<rcov>, [">= 0"])
+    s.add_dependency(%q<rbtagger>, [">= 0.0.0"])
+  end
+end

data/test/helper.rb ADDED Viewed

@@ -0,0 +1,18 @@
+require 'rubygems'
+require 'bundler'
+begin
+  Bundler.setup(:default, :development)
+rescue Bundler::BundlerError => e
+  $stderr.puts e.message
+  $stderr.puts "Run `bundle install` to install missing gems"
+  exit e.status_code
+end
+require 'test/unit'
+require 'shoulda'
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+$LOAD_PATH.unshift(File.dirname(__FILE__))
+require 'term-extract'
+class Test::Unit::TestCase
+end

data/test/test_term-extract.rb ADDED Viewed

@@ -0,0 +1,174 @@
+require 'helper'
+class TestTermExtract < Test::Unit::TestCase
+  @@DOC1 = <<DOC1
+The London Stock Exchange is a stock exchange located in London, United Kingdom.
+Founded in 1801, it is one of the largest stock exchanges in the world, with many
+overseas listings as well as British companies. The exchange is part of the
+London Stock Exchange Group and so sometimes referred to by the ticker symbol
+for the group, LSE. Its current premises are situated in Paternoster Square
+close to St Paul's Cathedral in the City of London
+DOC1
+  @@DOC2 = <<DOC2
+Secretary of State Owen Paterson has appointed Peter Osborne as Chair of the
+Parades Commission for Northern Ireland and six new Commission members.
+DOC2
+  @@DOCUMENT = <<SOURCE
+Police shut Palestinian theatre in Jerusalem.
+Israeli police have shut down a Palestinian theatre in East Jerusalem.
+The action, on Thursday, prevented the closing event of an international
+literature festival from taking place.
+Police said they were acting on a court order, issued after intelligence
+indicated that the Palestinian Authority was involved in the event.
+Israel has occupied East Jerusalem since 1967 and has annexed the
+area. This is not recognised by the international community.
+The British consul-general in Jerusalem , Richard Makepeace, was
+attending the event.
+"I think all lovers of literature would regard this as a very
+regrettable moment and regrettable decision," he added.
+Mr Makepeace said the festival's closing event would be reorganised to
+take place at the British Council in Jerusalem.
+The Israeli authorities often take action against events in East
+Jerusalem they see as connected to the Palestinian Authority.
+Saturday's opening event at the same theatre was also shut down.
+A police notice said the closure was on the orders of Israel's internal
+security minister on the grounds of a breach of interim peace accords
+from the 1990s.
+These laid the framework for talks on establishing a Palestinian state
+alongside Israel, but left the status of Jerusalem to be determined by
+further negotiation.
+Israel has annexed East Jerusalem and declares it part of its eternal
+capital.
+Palestinians hope to establish their capital in the area.
+SOURCE
+  @@TERMS = [
+    'British Council',
+    'British consul-general',
+    'East Jerusalem',
+    'Israel',
+    'Israeli authorities',
+    'Israeli police',
+    'Mr Makepeace',
+    'Palestinian Authority',
+    'Palestinian state',
+    'Palestinian theatre',
+    'Palestinians hope',
+    'Richard Makepeace',
+    'court order',
+    'event',
+    'literature festival',
+    'peace accords',
+    'police notice',
+    'security minister'
+  ]
+  context "Without a default term extractor" do
+    should "extract terms from a document" do
+      terms = TermExtract.extract(@@DOCUMENT)
+      @@TERMS.each do |term|
+        assert terms.keys.include?(term), "#{term} not found"
+      end
+    end
+  end
+  context "With a default term extractor" do
+    setup do
+      @te = TermExtract.new()
+    end
+    should "extract terms from a document" do
+      terms = @te.extract(@@DOCUMENT)
+      @@TERMS.each do |term|
+        assert terms.keys.include?(term), "#{term} not found"
+      end
+    end
+    should "extract terms with apostrophes in" do
+      terms = @te.extract(@@DOC1)
+      assert terms.keys.include?("St Paul's Cathedral")
+    end
+    should "extract terms with joining words" do
+      terms = @te.extract(@@DOC2)
+      assert terms.keys.include?("Secretary of State Owen Paterson")
+    end
+    should "extract terms and include pos tags when configured to" do
+      @te.include_tags = true
+      terms = @te.extract(@@DOCUMENT)
+      term = terms.keys.first
+      assert terms[term].key?(:tag)
+      assert terms[term][:tag]
+    end
+    should "extract common nouns when configured to" do
+      @te.types = :nn
+      terms = @te.extract(@@DOCUMENT)
+      assert terms.length == 11
+    end
+    context "with min_occurance set to 2" do
+      setup do
+        @te.min_occurance=2
+      end
+      should "extract terms that occur equal to or more than min_occurance" do
+        terms = @te.extract(@@DOCUMENT)
+        assert terms.keys.include?("Police")
+        assert terms['Police'] == @te.min_occurance
+      end
+    end
+    context "with min_terms set to 3" do
+      setup do
+        @te.min_terms=3
+      end
+      should "extract terms that have the same number of words as min_terms" do
+        terms = @te.extract(@@DOCUMENT)
+        assert terms.keys.include?("Saturday's opening event")
+      end
+    end
+    context "with include_tags set to true" do
+      setup do
+        @te.include_tags=true
+      end
+      should "include pos tags in the results" do
+        terms = @te.extract(@@DOCUMENT)
+        assert terms.keys.include?("Jerusalem")
+        assert terms['Jerusalem'][:tag] == 'NNP'
+      end
+    end
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,168 @@
+--- !ruby/object:Gem::Specification
+name: term-extract
+version: !ruby/object:Gem::Version
+  hash: 23
+  prerelease: false
+  segments:
+  - 0
+  - 2
+  - 0
+  version: 0.2.0
+platform: ruby
+authors:
+- rattle
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2010-12-23 00:00:00 +00:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  prerelease: false
+  name: rbtagger
+  version_requirements: &id001 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  requirement: *id001
+  type: :runtime
+- !ruby/object:Gem::Dependency
+  prerelease: false
+  name: shoulda
+  version_requirements: &id002 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  requirement: *id002
+  type: :development
+- !ruby/object:Gem::Dependency
+  prerelease: false
+  name: bundler
+  version_requirements: &id003 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        hash: 23
+        segments:
+        - 1
+        - 0
+        - 0
+        version: 1.0.0
+  requirement: *id003
+  type: :development
+- !ruby/object:Gem::Dependency
+  prerelease: false
+  name: jeweler
+  version_requirements: &id004 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        hash: 7
+        segments:
+        - 1
+        - 5
+        - 2
+        version: 1.5.2
+  requirement: *id004
+  type: :development
+- !ruby/object:Gem::Dependency
+  prerelease: false
+  name: rcov
+  version_requirements: &id005 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  requirement: *id005
+  type: :development
+- !ruby/object:Gem::Dependency
+  prerelease: false
+  name: rbtagger
+  version_requirements: &id006 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 31
+        segments:
+        - 0
+        - 0
+        - 0
+        version: 0.0.0
+  requirement: *id006
+  type: :runtime
+description:
+email: robl@rjlee.net
+executables: []
+extensions: []
+extra_rdoc_files:
+- LICENSE.txt
+- README.markdown
+files:
+- .document
+- Gemfile
+- Gemfile.lock
+- LICENSE.txt
+- README.markdown
+- Rakefile
+- VERSION
+- lib/term-extract.rb
+- term-extract.gemspec
+- test/helper.rb
+- test/test_term-extract.rb
+has_rdoc: true
+homepage: http://github.com/rattle/term-extract
+licenses:
+- GPLv3
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+requirements: []
+rubyforge_project:
+rubygems_version: 1.3.7
+signing_key:
+specification_version: 3
+summary: Provides term extraction functionality
+test_files:
+- test/helper.rb
+- test/test_term-extract.rb