name_finder 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/COPYING.txt ADDED
@@ -0,0 +1,21 @@
1
+ == Licence (MIT)
2
+
3
+ Copyright (c) 2010-2013 Paul Battley
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,69 @@
1
+ # Name Finder
2
+
3
+ Find names from a known list in a text, taking account of names that may
4
+ overlap. For example, Waterloo and Waterloo East are separate stations;
5
+ NameFinder, knowing both, will not give a false match for Waterloo in a text
6
+ that mentions Waterloo East.
7
+
8
+ ## Examples
9
+
10
+ ```ruby
11
+ require "name_finder"
12
+
13
+ stations = [
14
+ "Bermondsey",
15
+ "South Bermondsey",
16
+ "Southwark",
17
+ "Waterloo",
18
+ "Waterloo East"
19
+ ]
20
+
21
+ nf = NameFinder.new
22
+ stations.each do |station|
23
+ nf.add station
24
+ end
25
+ ```
26
+
27
+ It can find the best matching name even when one name is the same as part of
28
+ another, whether they overlap at the start:
29
+
30
+ ```ruby
31
+ nf.find_in "Change here for trains from Waterloo East"
32
+ # => "Waterloo East"
33
+
34
+ nf.find_in "This train terminates at Waterloo"
35
+ # => "Waterloo"
36
+ ```
37
+
38
+ or at the end:
39
+
40
+ ```ruby
41
+ nf.find_in "Escalator closed at Bermondsey station"
42
+ # => "Bermondsey"
43
+
44
+ nf.find_in "Use South Bermondsey station for Millwall FC"
45
+ # => "South Bermondsey"
46
+ ```
47
+
48
+ It can also find all the matching names, without false positives for names
49
+ that are part of a longer name:
50
+
51
+ ```ruby
52
+ nf.find_all_in "South Bermondsey and Waterloo East"
53
+ # => ["South Bermondsey", "Waterloo East"]
54
+ ```
55
+
56
+ Names that are part of a longer name are still found when listed separately,
57
+ however:
58
+
59
+ ```ruby
60
+ nf.find_all_in "South Bermondsey and Bermondsey"
61
+ # => ["South Bermondsey", "Bermondsey"]
62
+ ```
63
+
64
+ ## Limitations
65
+
66
+ The present implementation handles only the letters A-Z. This can be customised
67
+ by subclassing `NameFinder` and changing the implementation of `normalize`.
68
+ The `normalize` method must use the same delimiter between words as is returned
69
+ by the `delimiter` method (normally a single space).
@@ -0,0 +1,43 @@
1
class NameFinder
  # A read-only cursor over a string.
  #
  # A Buffer pairs a string with a position. Every "moving" operation returns
  # a fresh Buffer that shares the same string, so a Buffer can be handed to
  # recursive code without that code disturbing the caller's position.
  #
  class Buffer
    # string   - the String to read from.
    # position - the Integer offset of the cursor within string.
    #
    def initialize(string, position = 0)
      @string = string
      @position = position
      @length = string.length
    end

    # Returns a new Buffer whose cursor is n characters further on.
    #
    def advance_by(n)
      new(@position + n)
    end

    # Returns a new Buffer positioned immediately after the first occurrence
    # of delimiter at or after the cursor, or at the end of the string when
    # the delimiter does not occur again.
    #
    # The delimiter may be longer than one character.
    #
    def advance_past(delimiter)
      # An empty delimiter would "match" at the cursor without consuming
      # anything, so it is treated as absent; callers looping on at_end? are
      # then guaranteed to make progress.
      found_at = delimiter.empty? ? nil : @string.index(delimiter, @position)
      if found_at
        new(found_at + delimiter.length)
      else
        new(@length)
      end
    end

    # True when the cursor has reached (or passed) the end of the string.
    #
    def at_end?
      @position >= @length
    end

    # Returns the single character under the cursor as a String.
    #
    def head
      @string[@position, 1]
    end

    # Returns a new Buffer with the cursor moved past the head character.
    #
    def rest
      new(@position + 1)
    end

    # Shows only the unread remainder of the string.
    #
    def inspect(*args)
      "<Buffer:#{@string[@position .. -1].inspect}>"
    end

  private
    # Builds a sibling Buffer over the same string at another position.
    def new(position)
      Buffer.new(@string, position)
    end
  end
end
@@ -0,0 +1,37 @@
1
class NameFinder
  # Wraps one node of the dictionary tree and knows how to insert and look up
  # terms below it.
  #
  # A node is a Hash. Single-character String keys lead to child nodes, and
  # the Integer key 0 holds the original term whose normalized spelling ends
  # at that node.
  #
  class NodeProxy
    # node      - the Hash for this position in the tree.
    # delimiter - the String that separates words in normalized text.
    #
    def initialize(node, delimiter)
      @node = node
      @delimiter = delimiter
    end

    attr_reader :node, :delimiter

    # Stores term under the path spelled out by the remaining characters of
    # buffer, creating child nodes as needed.
    #
    def add(buffer, term)
      if buffer.at_end?
        node[0] = term
      else
        subtree = node[buffer.head] ||= {}
        wrap(subtree).add buffer.rest, term
      end
    end

    # Returns the longest term that matches the text at the start of buffer
    # and ends on a word boundary, or nil when there is none.
    #
    # new_word is true when the character that led to this node was the
    # delimiter, i.e. a complete word has just been consumed.
    #
    def find(buffer, new_word = false)
      return node[0] if buffer.at_end?

      head = buffer.head
      subtree = node[head]
      longer = wrap(subtree).find(buffer.rest, head == delimiter) if subtree

      # Prefer the longer match. When the longer branch exists but does not
      # lead to a complete term (e.g. "Waterloo Eastbound" with "Waterloo"
      # and "Waterloo East" known), fall back to the term that ended at this
      # word boundary instead of reporting no match at all.
      longer || (node[0] if new_word)
    end

  private
    # Wraps a child node with the same delimiter.
    def wrap(node)
      NodeProxy.new(node, delimiter)
    end
  end
end
@@ -0,0 +1,3 @@
1
class NameFinder
  # Release version of the gem. Frozen so it cannot be modified in place.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,75 @@
1
+ require "name_finder/version"
2
+ require "name_finder/node_proxy"
3
+ require "name_finder/buffer"
4
+ require "set"
5
+
6
# Find names from a known list in a text, taking account of names that may
# overlap. For example, Waterloo and Waterloo East are separate stations;
# NameFinder, knowing both, will not give a false match for Waterloo in a text
# that mentions Waterloo East.
#
class NameFinder

  # Initialize a new NameFinder. tree, if supplied, should be the data
  # generated by the export method.
  #
  def initialize(tree={})
    @tree = tree
    @root = NodeProxy.new(tree, delimiter)
  end

  attr_reader :root

  # Add a term to NameFinder's dictionary. The term is stored under its
  # normalized spelling, but the original term is what the find methods
  # return.
  #
  def add(term)
    root.add Buffer.new(term_key(term) + delimiter), term
  end

  # Find the first name from the dictionary in haystack. Returns nil when
  # no name is present.
  #
  def find_in(haystack)
    find(haystack) do |found|
      return found
    end
    nil
  end

  # Find all names from the dictionary in haystack. Each name is returned
  # once, in order of first appearance.
  #
  def find_all_in(haystack)
    Set.new.tap { |all|
      find(haystack) do |found|
        all << found
      end
    }.to_a
  end

  # Export the tree of the current dictionary for later re-importing.
  #
  def export
    @tree
  end

private
  # Walks haystack word by word and yields every dictionary term found.
  #
  def find(haystack)
    remaining = Buffer.new(normalize(haystack) + delimiter)
    until remaining.at_end?
      found = root.find(remaining)
      if found
        yield found
        # Step over what was matched in the *normalized* text. The original
        # term can be longer (punctuation runs collapse to one delimiter), so
        # using its raw length could jump past following names. Always move
        # at least one character so a term with an empty key cannot stall
        # the scan.
        step = term_key(found).length
        remaining = remaining.advance_by(step > 0 ? step : 1)
      else
        remaining = remaining.advance_past(delimiter)
      end
    end
  end

  # Returns the dictionary key for term: its normalized form with leading
  # and trailing delimiters removed. Without the trimming, a term such as
  # "Mr." would be stored with a doubled trailing delimiter and could never
  # match in the middle of a text.
  #
  def term_key(term)
    key = normalize(term)
    return key if delimiter.empty?

    key = key[delimiter.length .. -1] while key.start_with?(delimiter)
    key = key[0 ... -delimiter.length] while key.end_with?(delimiter)
    key
  end

  # Lower-cases term and replaces every run of characters other than a-z
  # with the delimiter.
  #
  def normalize(term)
    term.downcase.gsub(/[^a-z]+/, delimiter)
  end

  # The separator placed between words in normalized text.
  #
  def delimiter
    " "
  end
end
data/perf/benchmark.rb ADDED
@@ -0,0 +1,8 @@
1
+ require File.expand_path("../performance", __FILE__)
2
+ require "benchmark"
3
+
4
# Build the job once, then time a single run of it under the "find_all"
# label.
finder_job = NameFinderJob.new

Benchmark.benchmark do |bench|
  bench.report("find_all") do
    finder_job.run
  end
end
@@ -0,0 +1,27 @@
1
+ lib_path = File.expand_path("../../lib", __FILE__)
2
+ $:.unshift lib_path unless $:.include?(lib_path)
3
+ require "name_finder"
4
+
5
# Workload used by the benchmark: loads a fixed set of names into a
# NameFinder and searches the sample text for them.
class NameFinderJob
  # Text to search; sample.txt is expected next to this file.
  SOURCE_PATH = File.expand_path("../sample.txt", __FILE__).freeze

  # Names loaded into the dictionary. Frozen so a run cannot alter the
  # workload of later runs.
  NAMES = [
    "Mr Bennet",
    "Mrs Bennet",
    "Miss Bingley",
    "Elizabeth",
    "Luther Blissett"
  ].freeze

  # Names the search must report, in this exact order.
  EXPECTED = ["Mr Bennet", "Elizabeth", "Mrs Bennet", "Miss Bingley"].freeze

  # Runs one complete search over the sample text.
  #
  # Raises RuntimeError when the result differs from EXPECTED, so a broken
  # implementation cannot produce a misleading timing.
  def run
    source = File.read(SOURCE_PATH)
    nf = NameFinder.new
    NAMES.each do |name|
      nf.add name
    end
    result = nf.find_all_in(source)
    raise "Unexpected output: #{result.inspect}" unless result == EXPECTED
  end
end
@@ -0,0 +1,89 @@
1
+ require File.expand_path("../test_helper", __FILE__)
2
+ require "name_finder"
3
+
4
# Behavioural specification for NameFinder. Each example starts from an
# empty dictionary, adds the terms it needs and then searches a text.
describe NameFinder do

  subject { NameFinder.new }

  # Single-result lookup.
  describe "find_in" do
    it "should find an exact match" do
      subject.add("aa bb")
      subject.find_in("aa bb").must_equal("aa bb")
    end

    it "should be case insensitive and case preserving" do
      subject.add("Aa bb")
      subject.find_in("AA BB").must_equal("Aa bb")
    end

    it "should find a substring match with text before" do
      subject.add("aa bb")
      subject.find_in("xx aa bb").must_equal("aa bb")
    end

    it "should find a substring match with text after" do
      subject.add("aa bb")
      subject.find_in("aa bb xx").must_equal("aa bb")
    end

    it "should find a substring match with text before and after" do
      subject.add("aa bb")
      subject.find_in("xx aa bb xx").must_equal("aa bb")
    end

    it "should return nil for no match" do
      subject.find_in("aa").must_be_nil
    end

    it "should not find a substring that does not end on a word boundary" do
      subject.add("aa bb")
      subject.find_in("aa bbb").must_be_nil
    end

    it "should not find a substring that does not begin on a word boundary" do
      subject.add("aa bb")
      subject.find_in("aaa bb").must_be_nil
    end

    it "should find longest exact match" do
      # Terms are added one by one, in this order.
      subject.add("aa")
      subject.add("aa bb")
      subject.add("aa bbb")
      subject.add("aa bbbb")
      subject.find_in("xx aa bbb xx").must_equal("aa bbb")
    end

    it "should find without regard to punctuation" do
      subject.add("Mr Ee")
      subject.find_in("Mr. Ee").must_equal("Mr Ee")
    end
  end

  # Multi-result lookup.
  describe "find_all_in" do
    it "should find multiple simple matches" do
      subject.add("aa bb")
      subject.add("cc dd")
      subject.find_all_in("aa bb cc dd").must_equal(["aa bb", "cc dd"])
    end

    it "should not find partially occluded matches" do
      subject.add("aa bb")
      subject.add("bb")
      subject.add("cc dd")
      subject.find_all_in("aa bb cc dd").must_equal(["aa bb", "cc dd"])
    end

    it "should find each match only once" do
      subject.add("aa bb")
      subject.add("cc dd")
      subject.find_all_in("aa bb cc dd aa bb cc dd").must_equal(["aa bb", "cc dd"])
    end
  end

  # A dictionary exported from one finder must be usable by another.
  it "should export and import tree" do
    subject.add("test data")
    exported_tree = subject.export
    reimported = NameFinder.new(exported_tree)
    reimported.find_in("test data").must_equal("test data")
  end
end
@@ -0,0 +1,4 @@
1
+ lib_path = File.expand_path("../../lib", __FILE__)
2
+ $:.unshift lib_path unless $:.include?(lib_path)
3
+ require "minitest/spec"
4
+ require "minitest/autorun"
metadata ADDED
@@ -0,0 +1,74 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: name_finder
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.1.0
6
+ platform: ruby
7
+ authors:
8
+ - Paul Battley
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-08-04 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ type: :development
16
+ version_requirements: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - ! '>='
19
+ - !ruby/object:Gem::Version
20
+ version: '0'
21
+ none: false
22
+ prerelease: false
23
+ name: rake
24
+ requirement: !ruby/object:Gem::Requirement
25
+ requirements:
26
+ - - ! '>='
27
+ - !ruby/object:Gem::Version
28
+ version: '0'
29
+ none: false
30
+ description: Find matching names in text, taking account of names that overlap but
31
+ are different (Waterloo and Waterloo East stations, for example).
32
+ email: pbattley@gmail.com
33
+ executables: []
34
+ extensions: []
35
+ extra_rdoc_files:
36
+ - COPYING.txt
37
+ - README.md
38
+ files:
39
+ - lib/name_finder/version.rb
40
+ - lib/name_finder/buffer.rb
41
+ - lib/name_finder/node_proxy.rb
42
+ - lib/name_finder.rb
43
+ - test/test_helper.rb
44
+ - test/name_finder_test.rb
45
+ - perf/benchmark.rb
46
+ - perf/performance.rb
47
+ - COPYING.txt
48
+ - README.md
49
+ homepage: https://github.com/threedaymonk/name_finder
50
+ licenses: []
51
+ post_install_message:
52
+ rdoc_options: []
53
+ require_paths:
54
+ - lib
55
+ required_ruby_version: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - ! '>='
58
+ - !ruby/object:Gem::Version
59
+ version: '0'
60
+ none: false
61
+ required_rubygems_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ none: false
67
+ requirements: []
68
+ rubyforge_project:
69
+ rubygems_version: 1.8.25
70
+ signing_key:
71
+ specification_version: 3
72
+ summary: Find matching names in text
73
+ test_files:
74
+ - test/name_finder_test.rb