name_finder 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/COPYING.txt ADDED
@@ -0,0 +1,21 @@
1
+ == Licence (MIT)
2
+
3
+ Copyright (c) 2010-2013 Paul Battley
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,69 @@
1
+ # Name Finder
2
+
3
+ Find names from a known list in a text, taking account of names that may
4
+ overlap. For example, Waterloo and Waterloo East are separate stations;
5
+ NameFinder, knowing both, will not give a false match for Waterloo in a text
6
+ that mentions Waterloo East.
7
+
8
+ ## Examples
9
+
10
+ ```ruby
11
+ require "name_finder"
12
+
13
+ stations = [
14
+ "Bermondsey",
15
+ "South Bermondsey",
16
+ "Southwark",
17
+ "Waterloo",
18
+ "Waterloo East"
19
+ ]
20
+
21
+ nf = NameFinder.new
22
+ stations.each do |station|
23
+ nf.add station
24
+ end
25
+ ```
26
+
27
+ It can find the best matching name even when one name is the same as part of
28
+ another, whether they overlap at the start:
29
+
30
+ ```ruby
31
+ nf.find_in "Change here for trains from Waterloo East"
32
+ # => "Waterloo East"
33
+
34
+ nf.find_in "This train terminates at Waterloo"
35
+ # => "Waterloo"
36
+ ```
37
+
38
+ or at the end:
39
+
40
+ ```ruby
41
+ nf.find_in "Escalator closed at Bermondsey station"
42
+ # => "Bermondsey"
43
+
44
+ nf.find_in "Use South Bermondsey station for Millwall FC"
45
+ # => "South Bermondsey"
46
+ ```
47
+
48
+ It can also find all the matching names, without false positives for names
49
+ that are part of a longer name:
50
+
51
+ ```ruby
52
+ nf.find_all_in "South Bermondsey and Waterloo East"
53
+ # => ["South Bermondsey", "Waterloo East"]
54
+ ```
55
+
56
+ Names that are part of a longer name are still found when listed separately,
57
+ however:
58
+
59
+ ```ruby
60
+ nf.find_all_in "South Bermondsey and Bermondsey"
61
+ # => ["South Bermondsey", "Bermondsey"]
62
+ ```
63
+
64
+ ## Limitations
65
+
66
+ The present implementation handles only the letters A-Z. This can be customised
67
+ by subclassing `NameFinder` and changing the implementation of `normalize`.
68
+ The `normalize` method must use the same delimiter between words as is returned
69
+ by the `delimiter` method (normally a single space).
@@ -0,0 +1,43 @@
1
class NameFinder
  # A read-only cursor over a string.
  #
  # A Buffer never modifies its string or its own position: every method that
  # "moves" returns a new Buffer sharing the same string at another position.
  class Buffer
    # string   - the text to walk over.
    # position - zero-based index of the current character (default 0).
    def initialize(string, position = 0)
      @string = string
      @position = position
      @length = string.length
    end

    # Returns a new Buffer positioned n characters further on.
    def advance_by(n)
      at(@position + n)
    end

    # Returns a new Buffer positioned just after the next occurrence of
    # delimiter at or after the current position, or at the end of the string
    # when the delimiter does not occur again.
    def advance_past(delimiter)
      # An empty delimiter would match at the current position and make no
      # progress, so treat it as "not found".
      found_at = delimiter.empty? ? nil : @string.index(delimiter, @position)
      at(found_at ? found_at + delimiter.length : @length)
    end

    # True once the position has reached (or passed) the end of the string.
    def at_end?
      @position >= @length
    end

    # The single character at the current position.
    def head
      @string[@position, 1]
    end

    # A new Buffer positioned one character further on.
    def rest
      at(@position + 1)
    end

    # Debugging representation showing the text that has not been consumed.
    def inspect(*args)
      "<Buffer:#{@string[@position .. -1].inspect}>"
    end

    private

    # Builds a Buffer over the same string at the given position.
    def at(position)
      Buffer.new(@string, position)
    end
  end
end
@@ -0,0 +1,37 @@
1
class NameFinder
  # Wraps one node of the dictionary tree and implements insertion and lookup.
  #
  # The tree is made of nested Hashes. Each single-character String key leads
  # to the child node for that character; the Integer key 0 holds the original
  # term at the node where a term's character sequence ends.
  class NodeProxy
    # node      - the Hash for this position in the tree.
    # delimiter - the character that separates words in normalized text.
    def initialize(node, delimiter)
      @node = node
      @delimiter = delimiter
    end

    attr_reader :node, :delimiter

    # Stores term in the tree under the characters remaining in buffer,
    # creating child nodes as needed. Returns term.
    def add(buffer, term)
      if buffer.at_end?
        node[0] = term
      else
        subtree = node[buffer.head] ||= {}
        wrap(subtree).add buffer.rest, term
      end
    end

    # Returns the longest term that matches at the start of buffer and ends on
    # a word boundary, or nil when there is none.
    #
    # new_word is true when the character consumed just before this node was
    # the delimiter, i.e. a term stored on this node ended on a word boundary.
    def find(buffer, new_word=false)
      return node[0] if buffer.at_end?

      head = buffer.head
      subtree = node[head]
      longer = subtree && wrap(subtree).find(buffer.rest, head == delimiter)

      # Prefer the longer match. When the deeper branch dead-ends (for example
      # "waterloo ends" entering the "waterloo east" branch on the "e"), fall
      # back to a term that ended at this word boundary instead of giving up.
      longer || (new_word ? node[0] : nil)
    end

    private

    # Wraps a child node with the same delimiter.
    def wrap(node)
      NodeProxy.new(node, delimiter)
    end
  end
end
@@ -0,0 +1,3 @@
1
class NameFinder
  # Version of the name_finder gem. Frozen so it cannot be mutated at runtime.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,75 @@
1
+ require "name_finder/version"
2
+ require "name_finder/node_proxy"
3
+ require "name_finder/buffer"
4
+ require "set"
5
+
6
# Find names from a known list in a text, taking account of names that may
# overlap. For example, Waterloo and Waterloo East are separate stations;
# NameFinder, knowing both, will not give a false match for Waterloo in a text
# that mentions Waterloo East.
#
class NameFinder

  # Initialize a new NameFinder. tree, if supplied, should be the data
  # generated by the export method.
  #
  def initialize(tree={})
    @tree = tree
    @root = NodeProxy.new(tree, delimiter)
  end

  attr_reader :root

  # Add a term to NameFinder's dictionary. The term is stored under its
  # normalized form, and the original term is what the find methods return.
  #
  # A term that normalizes to nothing (no letters at all) is ignored and nil
  # is returned: stored, it would act as a zero-length match and the search
  # could never move past it.
  #
  def add(term)
    normalized = normalize(term)
    return if normalized.empty?

    root.add Buffer.new(normalized + delimiter), term
  end

  # Find the first name from the dictionary in haystack. Returns the name as
  # it was added, or nil if no name is found.
  #
  def find_in(haystack)
    find(haystack) do |found|
      return found
    end
    nil
  end

  # Find all names from the dictionary in haystack. Returns an Array holding
  # each matching name once, in order of first appearance.
  #
  def find_all_in(haystack)
    matches = Set.new
    find(haystack) do |found|
      matches << found
    end
    matches.to_a
  end

  # Export the tree of the current dictionary for later re-importing.
  #
  def export
    @tree
  end

  private

  # Walk haystack from word to word, yielding each dictionary name found.
  def find(haystack)
    remaining = Buffer.new(normalize(haystack) + delimiter)
    until remaining.at_end?
      found = root.find(remaining)
      if found
        yield found
        # A match covers the normalized term plus its closing delimiter. The
        # original term may be longer or shorter than that (punctuation), so
        # its own length must not be used to move the cursor.
        remaining = remaining.advance_by(normalize(found).length + delimiter.length)
      else
        remaining = remaining.advance_past(delimiter)
      end
    end
  end

  # Reduce text to lower-case runs of the letters a-z joined by the delimiter.
  # Everything else, including leading and trailing punctuation or spaces, is
  # dropped, so "Mr. Ee." and "mr ee" normalize to the same string.
  def normalize(term)
    term.downcase.scan(/[a-z]+/).join(delimiter)
  end

  # The separator placed between words in normalized text.
  def delimiter
    " "
  end
end
data/perf/benchmark.rb ADDED
@@ -0,0 +1,8 @@
1
+ require File.expand_path("../performance", __FILE__)
2
+ require "benchmark"
3
+
4
finder_job = NameFinderJob.new

# Time one complete run of the job and print the measurement under the
# "find_all" label.
Benchmark.benchmark do |bench|
  bench.report("find_all") { finder_job.run }
end
@@ -0,0 +1,27 @@
1
+ lib_path = File.expand_path("../../lib", __FILE__)
2
+ $:.unshift lib_path unless $:.include?(lib_path)
3
+ require "name_finder"
4
+
5
# Workload used by the benchmark: builds a NameFinder from NAMES, searches the
# sample text for them and checks the result against EXPECTED.
class NameFinderJob
  # Text file searched on every run; it sits next to this file.
  SOURCE_PATH = File.expand_path("../sample.txt", __FILE__).freeze

  # Names loaded into the dictionary.
  NAMES = [
    "Mr Bennet",
    "Mrs Bennet",
    "Miss Bingley",
    "Elizabeth",
    "Luther Blissett"
  ].freeze

  # The result find_all_in must return for the sample text. "Luther Blissett"
  # is in NAMES but not here, so it is expected not to be found.
  EXPECTED = ["Mr Bennet", "Elizabeth", "Mrs Bennet", "Miss Bingley"].freeze

  # Runs the workload once. Raises a RuntimeError if the names found differ
  # from EXPECTED, so a benchmark of a broken implementation fails loudly.
  def run
    source = File.read(SOURCE_PATH)
    nf = NameFinder.new
    NAMES.each { |name| nf.add name }
    result = nf.find_all_in(source)
    raise "Unexpected output: #{result.inspect}" unless result == EXPECTED
  end
end
@@ -0,0 +1,89 @@
1
+ require File.expand_path("../test_helper", __FILE__)
2
+ require "name_finder"
3
+
4
describe NameFinder do

  # Every example starts from an empty dictionary.
  subject { NameFinder.new }

  describe "find_in" do
    it "should find an exact match" do
      subject.add("aa bb")
      subject.find_in("aa bb").must_equal("aa bb")
    end

    it "should be case insensitive and case preserving" do
      subject.add("Aa bb")
      subject.find_in("AA BB").must_equal("Aa bb")
    end

    it "should find a substring match with text before" do
      subject.add("aa bb")
      subject.find_in("xx aa bb").must_equal("aa bb")
    end

    it "should find a substring match with text after" do
      subject.add("aa bb")
      subject.find_in("aa bb xx").must_equal("aa bb")
    end

    it "should find a substring match with text before and after" do
      subject.add("aa bb")
      subject.find_in("xx aa bb xx").must_equal("aa bb")
    end

    it "should return nil for no match" do
      # Nothing has been added, so nothing can match.
      subject.find_in("aa").must_be_nil
    end

    it "should not find a substring that does not end on a word boundary" do
      subject.add("aa bb")
      subject.find_in("aa bbb").must_be_nil
    end

    it "should not find a substring that does not begin on a word boundary" do
      subject.add("aa bb")
      subject.find_in("aaa bb").must_be_nil
    end

    it "should find longest exact match" do
      # Shorter and longer candidates surround the term that is in the text.
      ["aa", "aa bb", "aa bbb", "aa bbbb"].each do |term|
        subject.add(term)
      end
      subject.find_in("xx aa bbb xx").must_equal("aa bbb")
    end

    it "should find without regard to punctuation" do
      subject.add("Mr Ee")
      subject.find_in("Mr. Ee").must_equal("Mr Ee")
    end
  end

  describe "find_all_in" do
    it "should find multiple simple matches" do
      ["aa bb", "cc dd"].each do |term|
        subject.add(term)
      end
      subject.find_all_in("aa bb cc dd").must_equal(["aa bb", "cc dd"])
    end

    it "should not find partially occluded matches" do
      # "bb" appears in the text only as part of the longer name "aa bb".
      ["aa bb", "bb", "cc dd"].each do |term|
        subject.add(term)
      end
      subject.find_all_in("aa bb cc dd").must_equal(["aa bb", "cc dd"])
    end

    it "should find each match only once" do
      ["aa bb", "cc dd"].each do |term|
        subject.add(term)
      end
      subject.find_all_in("aa bb cc dd aa bb cc dd").must_equal(["aa bb", "cc dd"])
    end
  end

  it "should export and import tree" do
    subject.add("test data")
    exported_tree = subject.export
    reimported = NameFinder.new(exported_tree)
    reimported.find_in("test data").must_equal("test data")
  end
end
@@ -0,0 +1,4 @@
1
+ lib_path = File.expand_path("../../lib", __FILE__)
2
+ $:.unshift lib_path unless $:.include?(lib_path)
3
+ require "minitest/spec"
4
+ require "minitest/autorun"
metadata ADDED
@@ -0,0 +1,74 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: name_finder
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.1.0
6
+ platform: ruby
7
+ authors:
8
+ - Paul Battley
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-08-04 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ type: :development
16
+ version_requirements: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ! '>='
19
+ - !ruby/object:Gem::Version
20
+ version: '0'
21
+ none: false
22
+ prerelease: false
23
+ name: rake
24
+ requirement: !ruby/object:Gem::Requirement
25
+ requirements:
26
+ - - ! '>='
27
+ - !ruby/object:Gem::Version
28
+ version: '0'
29
+ none: false
30
+ description: Find matching names in text, taking account of names that overlap but
31
+ are different (Waterloo and Waterloo East stations, for example).
32
+ email: pbattley@gmail.com
33
+ executables: []
34
+ extensions: []
35
+ extra_rdoc_files:
36
+ - COPYING.txt
37
+ - README.md
38
+ files:
39
+ - lib/name_finder/version.rb
40
+ - lib/name_finder/buffer.rb
41
+ - lib/name_finder/node_proxy.rb
42
+ - lib/name_finder.rb
43
+ - test/test_helper.rb
44
+ - test/name_finder_test.rb
45
+ - perf/benchmark.rb
46
+ - perf/performance.rb
47
+ - COPYING.txt
48
+ - README.md
49
+ homepage: https://github.com/threedaymonk/name_finder
50
+ licenses: []
51
+ post_install_message:
52
+ rdoc_options: []
53
+ require_paths:
54
+ - lib
55
+ required_ruby_version: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - ! '>='
58
+ - !ruby/object:Gem::Version
59
+ version: '0'
60
+ none: false
61
+ required_rubygems_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ none: false
67
+ requirements: []
68
+ rubyforge_project:
69
+ rubygems_version: 1.8.25
70
+ signing_key:
71
+ specification_version: 3
72
+ summary: Find matching names in text
73
+ test_files:
74
+ - test/name_finder_test.rb