entifier 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/Gemfile ADDED
@@ -0,0 +1,13 @@
1
+ source "http://rubygems.org"
2
+ # Add dependencies required to use your gem here.
3
+ # Example:
4
+ # gem "activesupport", ">= 2.3.5"
5
+
6
+ # Add dependencies to develop your gem here.
7
+ # Include everything needed to run rake, tests, features, etc.
8
+ group :development do
9
+ gem "shoulda", ">= 0"
10
+ gem "bundler", "~> 1.0.0"
11
+ gem "jeweler", "~> 1.5.2"
12
+ gem "rcov", ">= 0"
13
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,20 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ git (1.2.5)
5
+ jeweler (1.5.2)
6
+ bundler (~> 1.0.0)
7
+ git (>= 1.2.5)
8
+ rake
9
+ rake (0.8.7)
10
+ rcov (0.9.9)
11
+ shoulda (2.11.3)
12
+
13
+ PLATFORMS
14
+ ruby
15
+
16
+ DEPENDENCIES
17
+ bundler (~> 1.0.0)
18
+ jeweler (~> 1.5.2)
19
+ rcov
20
+ shoulda
data/LICENSE.txt ADDED
@@ -0,0 +1,5 @@
1
+ This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
2
+
3
+ This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
4
+
5
+ You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
data/README.rdoc ADDED
@@ -0,0 +1,33 @@
1
+ = entifier
2
+
3
+ Extract named entities from text.
4
+
5
+ == Installation
6
+
7
+ gem install entifier
8
+
9
+ == Usage
10
+
11
+ require 'rubygems'
12
+ require 'entifier'
13
+ require 'pp'
14
+
15
+ text = "Gordon Brown went to London to see Tony Blair."
16
+ pp Entifier.extract(text)
17
+
18
+
19
+ == Contributing to entifier
20
+
21
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
22
+ * Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
23
+ * Fork the project
24
+ * Start a feature/bugfix branch
25
+ * Commit and push until you are happy with your contribution
26
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
27
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate the change in its own commit so I can cherry-pick around it.
28
+
29
+ == Copyright
30
+
31
+ Copyright (c) 2011 Frankie Roberto. See LICENSE.txt for
32
+ further details.
33
+
data/Rakefile ADDED
@@ -0,0 +1,53 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+ require 'rake'
11
+
12
+ require 'jeweler'
13
+ Jeweler::Tasks.new do |gem|
14
+ # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
15
+ gem.name = "entifier"
16
+ gem.homepage = "http://github.com/rattle/entifier"
17
+ gem.license = "GPLv3"
18
+ gem.summary = %Q{Entifier gem to extract named entities}
19
+ #gem.description = %Q{TODO: longer description of your gem}
20
+ gem.email = "frankie[at]rattlecentral.com"
21
+ gem.authors = ["robl"]
22
+ # Include your dependencies below. Runtime dependencies are required when using your gem,
23
+ # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
24
+ # gem.add_runtime_dependency 'jabber4r', '> 0.1'
25
+ # gem.add_development_dependency 'rspec', '> 1.2.3'
26
+ end
27
+ Jeweler::RubygemsDotOrgTasks.new
28
+
29
+ require 'rake/testtask'
30
+ Rake::TestTask.new(:test) do |test|
31
+ test.libs << 'lib' << 'test'
32
+ test.pattern = 'test/**/test_*.rb'
33
+ test.verbose = true
34
+ end
35
+
36
+ require 'rcov/rcovtask'
37
+ Rcov::RcovTask.new do |test|
38
+ test.libs << 'test'
39
+ test.pattern = 'test/**/test_*.rb'
40
+ test.verbose = true
41
+ end
42
+
43
+ task :default => :test
44
+
45
+ require 'rake/rdoctask'
46
+ Rake::RDocTask.new do |rdoc|
47
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
48
+
49
+ rdoc.rdoc_dir = 'rdoc'
50
+ rdoc.title = "entifier #{version}"
51
+ rdoc.rdoc_files.include('README*')
52
+ rdoc.rdoc_files.include('lib/**/*.rb')
53
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.0
data/entifier.gemspec ADDED
@@ -0,0 +1,69 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{entifier}
8
+ s.version = "0.0.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["robl"]
12
+ s.date = %q{2011-01-06}
13
+ s.email = %q{frankie[at]rattlecentral.com}
14
+ s.extra_rdoc_files = [
15
+ "LICENSE.txt",
16
+ "README.rdoc"
17
+ ]
18
+ s.files = [
19
+ ".document",
20
+ "Gemfile",
21
+ "Gemfile.lock",
22
+ "LICENSE.txt",
23
+ "README.rdoc",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "entifier.gemspec",
27
+ "lib/entifier.rb",
28
+ "lib/entifier/extensions.rb",
29
+ "lib/entifier/init.rb",
30
+ "test/errors/errors.rb",
31
+ "test/helper.rb",
32
+ "test/known_failures/known_failures_test.rb",
33
+ "test/test_entifier.rb"
34
+ ]
35
+ s.homepage = %q{http://github.com/rattle/entifier}
36
+ s.licenses = ["GPLv3"]
37
+ s.require_paths = ["lib"]
38
+ s.rubygems_version = %q{1.3.7}
39
+ s.summary = %q{Entifier gem to extract named entities}
40
+ s.test_files = [
41
+ "test/errors/errors.rb",
42
+ "test/helper.rb",
43
+ "test/known_failures/known_failures_test.rb",
44
+ "test/test_entifier.rb"
45
+ ]
46
+
47
+ if s.respond_to? :specification_version then
48
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
49
+ s.specification_version = 3
50
+
51
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
52
+ s.add_development_dependency(%q<shoulda>, [">= 0"])
53
+ s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
54
+ s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
55
+ s.add_development_dependency(%q<rcov>, [">= 0"])
56
+ else
57
+ s.add_dependency(%q<shoulda>, [">= 0"])
58
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
59
+ s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
60
+ s.add_dependency(%q<rcov>, [">= 0"])
61
+ end
62
+ else
63
+ s.add_dependency(%q<shoulda>, [">= 0"])
64
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
65
+ s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
66
+ s.add_dependency(%q<rcov>, [">= 0"])
67
+ end
68
+ end
69
+
@@ -0,0 +1,21 @@
1
class Hash
  # Recursively converts the keys of this hash, and of every nested Hash
  # value, to Symbols. The receiver is modified in place.
  #
  # Keys that do not respond to +to_sym+ are kept unchanged.
  # Adapted from merb/core_ext/hash.rb.
  #
  # Returns self.
  def nested_symbolize_keys!
    # Iterate over a snapshot of the keys: inserting a new key into a hash
    # while iterating the hash itself raises on Ruby 1.9 and later.
    keys.each do |key|
      value = delete(key)
      new_key = key.respond_to?(:to_sym) ? key.to_sym : key
      self[new_key] = Hash === value ? value.nested_symbolize_keys! : value
    end
    self
  end

  # Recursively converts the keys of this hash, and of every nested Hash
  # value, to Strings. The receiver is modified in place.
  #
  # Keys that do not respond to +to_s+ are kept unchanged.
  #
  # Returns self.
  def nested_stringify_keys!
    # Same key snapshot as above, so no key is added during iteration.
    keys.each do |key|
      value = delete(key)
      new_key = key.respond_to?(:to_s) ? key.to_s : key
      self[new_key] = Hash === value ? value.nested_stringify_keys! : value
    end
    self
  end

end
@@ -0,0 +1 @@
1
+ # Added for gem compatibility
data/lib/entifier.rb ADDED
@@ -0,0 +1,110 @@
1
+ %w(extensions).each do |file|
2
+ require File.join(File.dirname(__FILE__), 'entifier', file)
3
+ end
4
+
5
+ class Entifier
6
+
7
+ DAY_NAMES = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
8
+ MONTH_NAMES = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
9
+ INDEXICALS_PRECEDING_APOSTROPHE_S = ["Here", "There", "He", "She", "It", "Now", "Who"]
10
+
11
+ def self.extract(string, options = {})
12
+ options.nested_stringify_keys!
13
+
14
+ entities = []
15
+
16
+ # HACK! This allows comma separated entities to be picked up.
17
+ # string.gsub!(/\,\s/, ", ")
18
+
19
+ # HACK! This allows entities to be at the end and start of conjoining sentences.
20
+ # string.gsub!(/\./, ".") No longer needed
21
+
22
+ # HACK! Remove extra spaces between sentences.
23
+ #string.gsub!(/([\.\?\!])\s\s+/, "\1")
24
+
25
+ # Pre-processor: remove extra spaces (this shouldn't affect which entities are detected,
26
+ # it just makes the output look better)
27
+ string.gsub!(/[[:blank:]]+/, "\s")
28
+
29
+ # HACK! This allows parenthesized entity following other entity to be picked up.
30
+ string.gsub!(/\s\(/, " (")
31
+
32
+
33
+ capitalised_word = /[ÄÅÖA-Z](?:[a-zA-ZÄÅÖÜàâæçéèêëîïôøöûùüÿñ\-\d\&]+|\.(?:[A-Z]\.)*)/
34
+ capitalised_word_phrase = %r{
35
+ (?:\d{4}\s|Dr\.\s)?
36
+ #{capitalised_word}
37
+ (?:
38
+ (?:
39
+ (?:\s+(?:of|for|on|of\sthe|\&|d\'|du|de)|\'s)
40
+ )?
41
+ \s+#{capitalised_word})*
42
+ (?:\s\d+)?
43
+ }x
44
+
45
+
46
+ regex = %r{
47
+ (?:
48
+ (?:
49
+ (?:\A|[\.\?\!\:][\"\']?\s+|\n) # At start of string, or starting new sentence...
50
+ (?:[\"\'\(])? # ...optionally started with quote marks.
51
+ )
52
+ (
53
+ (?:In\s(?:\d{4}\s)?)?
54
+ #{capitalised_word_phrase}(?:\'s)?
55
+ )
56
+ | # --- OR ---
57
+
58
+ [^\.\n\?\!\:\"][[:blank:]][\"\'\(]? # After any non-full-stop followed by a space...
59
+
60
+ (#{capitalised_word_phrase})
61
+ )
62
+ }x
63
+
64
+
65
+ #[\,\'\s\.\Z]
66
+
67
+ string.scan(regex) do |match|
68
+ #entity = match
69
+ if match[0]
70
+ word_count = match[0].split(" ").size
71
+ if word_count > 1
72
+ entity = match[0].gsub(/\A(In(?:\s\d{4})?|The|If|But|Two|(?:One|Two)\sof)\s/, "").gsub(/\'s\Z/, "")
73
+ elsif match[0][-2,2] == "'s"
74
+ entity = match[0].gsub(/\'s\Z/, "")
75
+ elsif match[0] =~ /\A[A-Z]+\Z/
76
+ entity = match[0]
77
+ else
78
+ entity = nil
79
+ end
80
+ else
81
+ entity = match[1]
82
+ end
83
+
84
+ # HACK: These should really be filtered out by the regex.
85
+ if entity
86
+ # entity = entity.strip.gsub(/\'s\Z/, "").gsub(/\AIn\s/, "").gsub(/\AIf\s/, "")
87
+ end
88
+ entity = nil if DAY_NAMES.include?(entity)
89
+ entity = nil if MONTH_NAMES.include?(entity)
90
+ entity = nil if INDEXICALS_PRECEDING_APOSTROPHE_S.include?(entity)
91
+
92
+
93
+ if entity
94
+ entity.gsub!( /((I|i)n\s)(January|Feburary|March|April|May|June|July|August|September|October|November|December)/, "")
95
+ entity.gsub!( /(January|Feburary|March|April|May|June|July|August|September|October|November|December)\s(\d{4}|\d{2})/, "")
96
+ entity.gsub!(/(O|\so)n\s(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)/, "")
97
+
98
+ entity.gsub!(/\A\d+\Z/, "") # Make string blank if it's just numbers.
99
+ entity = nil if entity == "" # If there's nothing left, make it nil.
100
+
101
+ end
102
+
103
+
104
+ entities << entity unless entity.nil? # Don't collect the entity if it's nil
105
+ end
106
+ entities.uniq!
107
+ return entities
108
+ end
109
+
110
+ end
@@ -0,0 +1,59 @@
1
# The shared test setup shipped with the gem is test/helper.rb; the gem's
# file list contains no test_helper.rb.
require 'helper'
require 'pp'
3
+ class PAErrorsTest < Test::Unit::TestCase
4
+
5
+
6
+
7
+ # PA File : eAP-D979UPS01.eAP-NA-US-GM-Labor-Costs.nitf.xml
8
+ should "extract 'Treasury Department'" do
9
+ assert Entifier.extract("to the Treasury Department Tuesday").include?('Treasury Department')
10
+ end
11
+
12
+ # PA File : eAP-D979UI0O1.eAP-EU-Obama-Queen-apos-s-Song-List.nitf.xml
13
+ should "treat quotes as one string ? i.e. we shouldn't extract Dolly here" do
14
+ assert !Entifier.extract('Hello, Dolly!" Carol Channing').include?('Dolly')
15
+ end
16
+
17
+ should "not treat quote mark as ending a sentence" do
18
+ assert_equal "Frankie", Entifier.extract("\"This should work\" Frankie said.").first
19
+ end
20
+
21
+ # ----
22
+
23
+ should "not extract Feb" do
24
+ assert !Entifier.extract('Nazir had previously allied with the Pakistani government to fight Uzbeks partnered with the international terrorist group. The Feb. 22 communique, a copy of which was obtained by The Associated Press, announced the ').include?('Feb')
25
+ end
26
+
27
+ should "not extract Die" do
28
+ assert !Entifier.extract('FunnyOrDie.com, the comedy video Web site co-founded by Will Ferrell, announced that it had been bought by country star Reba McEntire. The site was temporarily renamed "Reba or Die" and its home page was populated entirely with videos featuring McEntire').include?('Die')
29
+ end
30
+
31
+
32
+ text = <<PA
33
+ President Barack Obama's gift of an iPod to Queen Elizabeth II came loaded with 40 songs from popular Broadway productions, including "The King and I," "West Side Story" and "Dreamgirls." The iPod was given to accompany a rare coffee table book of songs by composers Richard Rodgers and Lorenz Hart, which Obama also gave the queen. Songs on the iPod are: "Oklahoma!" "If I Loved You," Jan Clayton, "Carousel" "You'll Never Walk Alone," Jan Clayton, "Carousel" "There's No Business Like Show Business," Ethel Merman, "Annie Get Your Gun" "Once in Love with Amy (Where's Charley?)," Ray Bolger "Some Enchanted Evening," "South Pacific" "Diamonds Are a Girl's Best Friend," Carol Channing, "Gentlemen Prefer Blondes" "Getting to Know You," Gertrude Lawrence, "The King and I" "Shall We Dance?" Gertrude Lawrence, "The King and I" "I Could Have Danced All Night," Julie Andrews, "My Fair Lady" "I've Grown Accustomed to Her Face," Rex Harrison, "My Fair Lady" "The Party's Over (Bells Are Ringing)," Judy Holliday "Maria," "West Side Story" "Tonight," "West Side Story" "Seventy Six Trombones," "The Music Man" "Everything's Coming up Roses," Ethel Merman, "Gypsy" "The Sound of Music" "Try to Remember," Jerry Orbach, "The Fantasticks" "Camelot," Richard Burton "If Ever I Would Leave You," Robert Goulet, "Camelot" "Hello, Dolly!" 
Carol Channing "If I Were a Rich Man," Zero Mostel, "Fiddler on the Roof" "People," Barbra Streisand, "Funny Girl" "On a Clear Day (You Can See Forever)," John Cullum "The Impossible Dream," Richard Kiley, "Man of La Mancha" "Mame," Charles Braswell "Cabaret," Liza Minnelli "Aquarius, Ronald Dyson, "Hair' "Send in the Clowns," Judy Collins, "A Little Night Music" "All That Jazz," Chita Rivera, "Chicago" "One," "A Chorus Line" "Tomorrow," Andrea McArdle, "Annie" "Don't Cry for Me Argentina," Patti LuPone, "Evita" "And I Am Telling You I'm Not Going," Jennifer Holliday, "Dreamgirls" "Memory," Elaine Paige, "Cats" "The Best of Times," George Hearn, "La Cage Aux Folles" "I Dreamed a Dream," Aretha Franklin, "Les Miserables" "The Music of the Night," Michael Crawford, "The Phantom of the Opera" "As If We Never Said Goodbye," Elaine Paige, "Sunset Blvd." "Seasons of Love," "Rent"
34
+ PA
35
+
36
+ should "extract 'Ethel Merman'" do
37
+ assert Entifier.extract(text).include?('Ethel Merman')
38
+ end
39
+
40
+
41
+ should "extract 'Christine Lord'" do
42
+ assert Entifier.extract('This would be an added extra, outside their legal framework." But he added it could also be a question of money. "It may well be that it could be done but for it to be done the coroners would have to be given more resources than they are presently being given" Christine Lord, whose son died nearly two years ago of vCJD, urged coroners to carry out the tests as it had the potential to "save lives".').include?('Christine Lord')
43
+ end
44
+
45
+
46
+ #The video, which is on YouTube and the website of the multimedia magazine Don't Panic, which Prowse edits, has become a hit. It is entitled "Pound Force — Alan Duncan MP gets a new garden feature".
47
+ should "extract 'Don't Panic'" do
48
+ assert Entifier.extract('The video, which is on YouTube and the website of the multimedia magazine Don\'t Panic, which Prowse edits, has become a hit.').include?("Don't Panic")
49
+ end
50
+
51
# The quote before "Pound Force" is written unescaped: inside a single-quoted
# literal, \" would put a literal backslash into the input text.
should "extract 'Pound Force'" do
  assert Entifier.extract('has become a hit. It is entitled "Pound Force — Alan Duncan MP gets a new garden feature Panic').include?('Pound Force')
end
54
+
55
# A name that follows a bare full stop and space at the start of the input.
should "extract 'Ian Trow'" do
  assert Entifier.extract('. Ian Trow, 42, of Deanshanger, Milton Keynes, Buckinghamshire, and a').include?('Ian Trow')
end
58
+
59
+ end
data/test/helper.rb ADDED
@@ -0,0 +1,18 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+ require 'test/unit'
11
+ require 'shoulda'
12
+
13
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
14
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
15
+ require 'entifier'
16
+
17
+ class Test::Unit::TestCase
18
+ end
@@ -0,0 +1,13 @@
1
# The shared test setup shipped with the gem is test/helper.rb; the gem's
# file list contains no test_helper.rb.
require 'helper'

# Sentence-initial place names that should not be reported as entities,
# because a capital letter at the start of a sentence is not evidence of a
# proper noun.
class KnownFailuresTest < Test::Unit::TestCase

  should "not extract 'Hawaii' as we don't know that this is a proper noun" do
    assert !Entifier.extract("Hawaii basketball coach Bob Nash has agreed to a one-year").include?('Hawaii')
  end

  should "not extract Vietnam" do
    assert !Entifier.extract("Around the country, Masses at Catholic churches are heavily attended. Vietnam has often come under international criticism for its record on religious and human rights.").include?('Vietnam')
  end

end
@@ -0,0 +1,319 @@
1
+ require 'helper'
2
+
3
+ class TestEntifier < Test::Unit::TestCase
4
+
5
+
6
+ should "test_simple_name_at_start_of_string" do
7
+ assert_equal "Frankie Roberto", Entifier.extract("Frankie Roberto did blah.").first
8
+ end
9
+
10
+ def test_simple_name_at_end_of_sentence
11
+ assert_equal "Frankie Roberto", Entifier.extract("It was Frankie Roberto.").first
12
+ end
13
+
14
+ def test_simple_name_in_middle_of_string
15
+ assert_equal "Frankie Roberto", Entifier.extract("If it was Frankie Roberto who did it.").first
16
+ end
17
+
18
+ def test_name_with_apostrophe
19
+ assert_equal "Samantha Harvey", Entifier.extract("Samantha Harvey's first novel is...").first
20
+ end
21
+
22
+ def test_name_before_comma
23
+ assert_equal "Frankie Roberto", Entifier.extract("Call me Frankie Roberto, said the...").first
24
+ end
25
+
26
+ def test_single_word_name_at_start_with_apostrophe
27
+ assert_equal "Britain", Entifier.extract("Britain's nuclear capabilities...").first
28
+ end
29
+
30
+ def test_name_at_start_of_sentence
31
+ assert_equal "Frankie Roberto", Entifier.extract("First sentence. Frankie Roberto is fun.").first
32
+ end
33
+
34
+ def test_single_word_name_at_start_of_sentence
35
+ assert_equal "Britain", Entifier.extract("First sentence. Britain's nuclear capabilities...").first
36
+ end
37
+
38
+ def test_list_of_countries
39
+ entities = Entifier.extract("The US, Canada, Spain, Britain and Israel have confirmed cases of the virus, but no deaths have been reported outside Mexico.")
40
+ assert_equal 6, entities.size
41
+ assert_equal "US", entities.first
42
+ assert_equal "Mexico", entities.last
43
+ end
44
+
45
+ def test_WHO_example
46
+ entities = Entifier.extract("World Health Organization deputy chief Keiji Fukuda was speaking...")
47
+ assert_equal 2, entities.size
48
+ assert_equal "World Health Organization", entities.first
49
+ assert_equal "Keiji Fukuda", entities.last
50
+ end
51
+
52
+ def test_three_word_name
53
+ assert_equal "Dr Keiji Fukuda", Entifier.extract("The expert Dr Keiji Fukuda said it").first
54
+ end
55
+
56
+ def test_four_word_name_at_start_of_string
57
+ assert_equal "Mr Bob Jones Robert", Entifier.extract("Mr Bob Jones Robert said...").first
58
+
59
+ end
60
+
61
+ def test_name_containing_umlaut
62
+ assert_equal "Albrecht Dürer", Entifier.extract("Albrecht Dürer is a famous artist.").first
63
+ end
64
+
65
+ def test_dont_include_capitalised_sentence_starts
66
+ assert_equal 0, Entifier.extract("How now brown cow. Knees up mother brown.").size
67
+ end
68
+
69
+ def test_dont_include_capitalised_new_paragraph_starts
70
+ assert_equal 0, Entifier.extract("How now brown cow.\n\nKnees up mother brown.").size
71
+ end
72
+
73
+ def test_name_containing_of
74
+ assert_equal "Department of Health", Entifier.extract("If Department of Health is looking").first
75
+ end
76
+
77
+ def test_name_containing_of_in_middle_of_string
78
+ assert_equal "Department of Health", Entifier.extract("It has been reported that the Department of Health is looking").first
79
+ end
80
+
81
+ def test_name_continaing_in_in_middle_of_string
82
+ assert_equal "Intergovernmental Panel on Climate Change", Entifier.extract("At the Intergovernmental Panel on Climate Change, experts said...").first
83
+ end
84
+
85
+ def test_sentence_starting_with_in
86
+ assert_equal "Northern Ireland", Entifier.extract("In Northern Ireland the chief").first
87
+ end
88
+
89
+ def test_name_including_hypen
90
+ assert_equal "Mr Rory-Jones", Entifier.extract("He said to Mr Rory-Jones that").first
91
+ end
92
+
93
+ def test_dont_include_day_names
94
+ assert_equal 0, Entifier.extract("She was born on Thursday.").size
95
+ end
96
+
97
+ def test_dont_include_month_names
98
+ assert_equal 0, Entifier.extract("She was born in September.").size
99
+ end
100
+
101
+ def test_dont_include_its
102
+ assert_equal 0, Entifier.extract("It's not too late to...").size
103
+ end
104
+
105
+ def test_names_at_beginning_and_end_of_sentences
106
+ entities = Entifier.extract("He lived in London. Frankie Roberto is his name.")
107
+ assert_equal 2, entities.size
108
+ assert_equal "London", entities.first
109
+ assert_equal "Frankie Roberto", entities.last
110
+
111
+ end
112
+
113
+ def test_abbreviation_with_numeral
114
+ assert_equal "G8", Entifier.extract("At the G8 meeting last week...").first
115
+ end
116
+
117
+ def test_event_preceeded_by_year
118
+ assert_equal "2012 Olympics", Entifier.extract("At the 2012 Olympics, someone will win.").first
119
+ end
120
+
121
+ def dont_test_abbreviation_at_start_of_string_yet
122
+ assert_equal "US", Entifier.extract("USA officials have urged...").first
123
+ end
124
+
125
+ def test_name_with_of_followed_by_two_words
126
+ assert_equal "Department of Clinical Health", Entifier.extract("At the Department of Clinical Health, people were...").first
127
+ end
128
+
129
+ should "double quoted name" do
130
+ assert_equal "Demon Crossroads", Entifier.extract("At the place they call the \"Demon Crossroads\", two people met...").first
131
+ end
132
+
133
+ should "single quoted name" do
134
+ assert_equal "Demon Crossroads", Entifier.extract("At the place they call the 'Demon Crossroads', two people met...").first
135
+ end
136
+
137
+ should "name starting paragraph after header" do
138
+ assert_equal "Britain", Entifier.extract("Worrying times\n\nBritain's nuclear capabilities...").first
139
+ end
140
+
141
+ should "name after sentence ending in question mark" do
142
+ assert_equal "Frankie Roberto", Entifier.extract("Who's the daddy? Frankie Roberto is.").first
143
+ end
144
+
145
+ should "don't include capitalised word after questions mark" do
146
+ assert_equal 0, Entifier.extract("Who's the daddy? What a tricky question.").size
147
+ end
148
+
149
+ should "don't include capitalised word after exclaimation mark" do
150
+ assert_equal 0, Entifier.extract("Oh look! How peculiar.").size
151
+ end
152
+
153
+ should "name including 'of the'" do
154
+ assert_equal "Horseman of the Apocalypse", Entifier.extract("In the Horseman of the Apocalypse...").first
155
+ end
156
+
157
+ should "name included twice" do
158
+ assert_equal 1, Entifier.extract("Frankie Roberto was a man. Frankie Roberto was a mouse.").size
159
+ end
160
+
161
+ should "ignore extra spaces betwen capitalised words" do
162
+ assert_equal "Schools Secretary", Entifier.extract("the Schools Secretary is...").first
163
+ end
164
+
165
+ should "ignore space before two newlines" do
166
+ assert_nil Entifier.extract("A sentence.\s\n\nA new sentence.").first
167
+ end
168
+
169
+ should "allow entity with 'for' in it" do
170
+ assert_equal "Deputy Mayor for Government", Entifier.extract("The office of Deputy Mayor for Government is important.").first
171
+ end
172
+
173
+ should "ignore quote marks at the start of a sentence" do
174
+ assert_nil Entifier.extract("\"Blindly, he walked forwards\"").first
175
+ end
176
+
177
+ should "ignore 'if' at the beginning of a sentence" do
178
+ assert_equal "Frankie Roberto", Entifier.extract("If Frankie Roberto can do something.").first
179
+ end
180
+
181
+ should "allow entities to end in a number" do
182
+ assert_equal "BBC 1", Entifier.extract("And now, on BBC 1, it's...").first
183
+ end
184
+
185
+ should "treat a double line break as starting a new sentence" do
186
+ assert_nil Entifier.extract("the end\n\nThe start").first
187
+ end
188
+
189
+ should "treat a single line break as starting a new sentence" do
190
+ assert_nil Entifier.extract("the end\nThe start").first
191
+ end
192
+
193
+ should "treat full-stop followed by quote mark as starting a new sentence" do
194
+ assert_nil Entifier.extract("process.\" 'Emerging threats").first
195
+ end
196
+
197
+ should "not treat quote mark followed by a comma as ending a sentence" do
198
+ assert_equal "Frankie", Entifier.extract("\"This should work\", Frankie said.").first
199
+ end
200
+
201
+ should "include words with accented e character" do
202
+ assert_equal "Béarn", Entifier.extract("I live in Béarn.").first
203
+ end
204
+
205
+ should "include words with tilded n character" do
206
+ assert_equal "El Niño", Entifier.extract("He's called El Niño.").first
207
+ end
208
+
209
+ should "include words with funny characters" do
210
+ assert_equal "Bjørn Dæhlie", Entifier.extract("He's called Bjørn Dæhlie").first
211
+ end
212
+
213
+ should "include words starting with haloed A" do
214
+ assert_equal "Anders Jonas Ångström", Entifier.extract("His name is Anders Jonas Ångström.").first
215
+ end
216
+
217
+ should "include words starting with umlauted O" do
218
+ assert_equal "Öland", Entifier.extract("I live in Öland.").first
219
+ end
220
+
221
+ should "pick out a phrases with 'for' in it at the start of a sentence" do
222
+ assert_equal "Department for Transport", Entifier.extract("The Department for Transport has finished...").first
223
+ end
224
+
225
+ should "ignore but at the start of sentence" do
226
+ assert_equal "Foreign Secretary David Miliband", Entifier.extract("But Foreign Secretary David Miliband said that...").first
227
+ end
228
+
229
+ should "ignore months" do
230
+ assert_nil Entifier.extract("In the month of April 2008 bad things happened.").first
231
+ end
232
+
233
+ should "pick out abbreviations including ampersand" do
234
+ assert_equal "C&A", Entifier.extract("I used to shop at C&A.").first
235
+ end
236
+
237
+ should "not include 'on Tuesday' as part of a phrase" do
238
+ assert_equal "Commons", Entifier.extract("In the Commons on Tuesday, the...").first
239
+ end
240
+
241
+ should "not include 'On Friday' as an entity" do
242
+ assert_nil Entifier.extract("On Friday, something happened.").first
243
+ end
244
+
245
+ should "detect single word entities after 'If' at the start of a sentence" do
246
+ assert_equal "Frankie", Entifier.extract("If Frankie can do something.").first
247
+ end
248
+
249
+ should "extract 'Florida'" do
250
+ assert Entifier.extract("In Florida, Lynn Orr was waiting").include?('Florida')
251
+ end
252
+
253
+ # PA File : eAP-D979UQDG0.eAP-Officers-Indicted-1st-Ld-Writethru.nitf.xml
254
+ should "extract 'Baltimore'" do
255
+ assert_equal 'Baltimore', Entifier.extract("Two Baltimore police officers beat").first
256
+ end
257
+
258
+ # PA File : eAP-D979UPS01.eAP-NA-US-GM-Labor-Costs.nitf.xml
259
+ should "extract 'GM'" do
260
+ assert_equal "GM", Entifier.extract("GM submitted a progress report").first
261
+ end
262
+
263
+ # PA File : eAP-D979UHQ01.eAP-CB-Puerto-Rico-Agitated-Passenger.nitf.xml
264
+ should "extract FBI" do
265
+ assert_equal "FBI", Entifier.extract('The FBI has arrested').first
266
+ end
267
+
268
+ # PA File : eAP-D979U4P03.eAP-Wall-Street-19th-Ld-Writethru.nitf.xml
269
+ should "extract Russell 2000" do
270
+ assert_equal ["Russell 2000"], Entifier.extract("The Russell 2000 index of smaller companies")
271
+ end
272
+
273
+ # PA File : PRN-3869490en-1.PRN-GXX-HEALTH-HPV-Testing-Study.nitf.xml
274
+ should "extract 'Bill & Melinda Gates Foundation'" do
275
+ assert_equal ["Bill & Melinda Gates Foundation"], Entifier.extract("Funded by the Bill & Melinda Gates Foundation.")
276
+ end
277
+
278
+ should "not include year after 'in'" do
279
+ assert_equal "Appleton", Entifier.extract("In 1924 Appleton began...").first
280
+ end
281
+
282
+ should "extract April Fool's Day" do
283
+ assert_equal ["April Fool's Day"], Entifier.extract(" was being tracked throughout April Fool's Day, more ")
284
+ end
285
+
286
+ should "ignore 'One of' at start of sentence" do
287
+ assert_equal ["Britain"], Entifier.extract("One of Britain's best-known...")
288
+ end
289
+
290
+ should "not extract numbers" do
291
+ assert_equal [], Entifier.extract("The 2-1 victory over...")
292
+ end
293
+
294
+ should "extract 'Cap d' Antibes" do
295
+ assert_equal ["Cap d' Antibes"], Entifier.extract("In Cap d' Antibes, ...")
296
+ end
297
+
298
+ should "find entities at start and end of sentences" do
299
+ assert_equal ["Frankie Roberto", "United Kingdom"], Entifier.extract("name was Frankie Roberto. United Kingdom was...")
300
+ end
301
+
302
+ should "not extract 'Dr'" do
303
+ assert_equal ["Dr. Christopher Ziebell"], Entifier.extract("\"It's a pretty significant issue,\" said Dr. Christopher Ziebell, chief of the emergency department...")
304
+ end
305
+
306
+ should "extract 'B.J. Upton'" do
307
+ assert_equal ["B.J. Upton"], Entifier.extract("The move won't be made until Sunday, when injured outfielders B.J. Upton and ...")
308
+ end
309
+
310
+ should "extract Richard E. Grant" do
311
+ assert_equal ["Richard E. Grant"], Entifier.extract("The film starred Richard E. Grant as...")
312
+ end
313
+
314
# The expected value uses the same spelling as the input text ("Preez");
# the lower-case connector "du" is part of the name.
should "extract 'Fourie du Preez'" do
  assert_equal ["Fourie du Preez"], Entifier.extract("...looking forward to pitting his wits against experienced opposite number Fourie du Preez when the...")
end
317
+
318
+
319
+ end
metadata ADDED
@@ -0,0 +1,144 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: entifier
3
+ version: !ruby/object:Gem::Version
4
+ hash: 31
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 0
10
+ version: 0.0.0
11
+ platform: ruby
12
+ authors:
13
+ - robl
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-01-06 00:00:00 +00:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ prerelease: false
23
+ name: shoulda
24
+ version_requirements: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
32
+ version: "0"
33
+ requirement: *id001
34
+ type: :development
35
+ - !ruby/object:Gem::Dependency
36
+ prerelease: false
37
+ name: bundler
38
+ version_requirements: &id002 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ~>
42
+ - !ruby/object:Gem::Version
43
+ hash: 23
44
+ segments:
45
+ - 1
46
+ - 0
47
+ - 0
48
+ version: 1.0.0
49
+ requirement: *id002
50
+ type: :development
51
+ - !ruby/object:Gem::Dependency
52
+ prerelease: false
53
+ name: jeweler
54
+ version_requirements: &id003 !ruby/object:Gem::Requirement
55
+ none: false
56
+ requirements:
57
+ - - ~>
58
+ - !ruby/object:Gem::Version
59
+ hash: 7
60
+ segments:
61
+ - 1
62
+ - 5
63
+ - 2
64
+ version: 1.5.2
65
+ requirement: *id003
66
+ type: :development
67
+ - !ruby/object:Gem::Dependency
68
+ prerelease: false
69
+ name: rcov
70
+ version_requirements: &id004 !ruby/object:Gem::Requirement
71
+ none: false
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ hash: 3
76
+ segments:
77
+ - 0
78
+ version: "0"
79
+ requirement: *id004
80
+ type: :development
81
+ description:
82
+ email: frankie[at]rattlecentral.com
83
+ executables: []
84
+
85
+ extensions: []
86
+
87
+ extra_rdoc_files:
88
+ - LICENSE.txt
89
+ - README.rdoc
90
+ files:
91
+ - .document
92
+ - Gemfile
93
+ - Gemfile.lock
94
+ - LICENSE.txt
95
+ - README.rdoc
96
+ - Rakefile
97
+ - VERSION
98
+ - entifier.gemspec
99
+ - lib/entifier.rb
100
+ - lib/entifier/extensions.rb
101
+ - lib/entifier/init.rb
102
+ - test/errors/errors.rb
103
+ - test/helper.rb
104
+ - test/known_failures/known_failures_test.rb
105
+ - test/test_entifier.rb
106
+ has_rdoc: true
107
+ homepage: http://github.com/rattle/entifier
108
+ licenses:
109
+ - GPLv3
110
+ post_install_message:
111
+ rdoc_options: []
112
+
113
+ require_paths:
114
+ - lib
115
+ required_ruby_version: !ruby/object:Gem::Requirement
116
+ none: false
117
+ requirements:
118
+ - - ">="
119
+ - !ruby/object:Gem::Version
120
+ hash: 3
121
+ segments:
122
+ - 0
123
+ version: "0"
124
+ required_rubygems_version: !ruby/object:Gem::Requirement
125
+ none: false
126
+ requirements:
127
+ - - ">="
128
+ - !ruby/object:Gem::Version
129
+ hash: 3
130
+ segments:
131
+ - 0
132
+ version: "0"
133
+ requirements: []
134
+
135
+ rubyforge_project:
136
+ rubygems_version: 1.3.7
137
+ signing_key:
138
+ specification_version: 3
139
+ summary: Entifier gem to extract named entities
140
+ test_files:
141
+ - test/errors/errors.rb
142
+ - test/helper.rb
143
+ - test/known_failures/known_failures_test.rb
144
+ - test/test_entifier.rb