RubyGems - name_finder - Versions diffs - 0.1.0 - Mend

Files changed (11) hide show

data/COPYING.txt +21 -0
data/README.md +69 -0
data/lib/name_finder/buffer.rb +43 -0
data/lib/name_finder/node_proxy.rb +37 -0
data/lib/name_finder/version.rb +3 -0
data/lib/name_finder.rb +75 -0
data/perf/benchmark.rb +8 -0
data/perf/performance.rb +27 -0
data/test/name_finder_test.rb +89 -0
data/test/test_helper.rb +4 -0
metadata +74 -0

data/COPYING.txt ADDED Viewed

@@ -0,0 +1,21 @@
+== Licence (MIT)
+Copyright (c) 2010-2013 Paul Battley
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,69 @@
+# Name Finder
+Find names from a known list in a text, taking account of names that may
+overlap. For example, Waterloo and Waterloo East are separate stations;
+NameFinder, knowing both, will not give a false match for Waterloo in a text
+that mentions Waterloo East.
+## Examples
+```ruby
+require "name_finder"
+stations = [
+  "Bermondsey",
+  "South Bermondsey",
+  "Southwark",
+  "Waterloo",
+  "Waterloo East"
+]
+nf = NameFinder.new
+stations.each do |station|
+  nf.add station
+end
+```
+It can find the best matching name even when one name is the same as part of
+another, whether they overlap at the start:
+```ruby
+nf.find_in "Change here for trains from Waterloo East"
+# => "Waterloo East"
+nf.find_in "This train terminates at Waterloo"
+# => "Waterloo"
+```
+or at the end:
+```ruby
+nf.find_in "Escalator closed at Bermondsey station"
+# => "Bermondsey"
+nf.find_in "Use South Bermondsey station for Millwall FC"
+# => "South Bermondsey"
+```
+It can also find all the matching names, without false positives for names
+that are part of a longer name:
+```ruby
+nf.find_all_in "South Bermondsey and Waterloo East"
+# => ["South Bermondsey", "Waterloo East"]
+```
+Names that are part of a longer name are still found when listed separately,
+however:
+```ruby
+nf.find_all_in "South Bermondsey and Bermondsey"
+# => ["South Bermondsey", "Bermondsey"]
+```
+## Limitations
+The present implementation handles only the letters A-Z. This can be customised
+by subclassing `NameFinder` and changing the implementation of `normalize`.
+The `normalize` method must use the same delimiter between words as is returned
+by the `delimiter` method (normally a single space).

data/lib/name_finder/buffer.rb ADDED Viewed

@@ -0,0 +1,43 @@
+class NameFinder
+  class Buffer
+    def initialize(string, position = 0)
+      @string = string
+      @position = position
+      @length = string.length
+    end
+    def advance_by(n)
+      new(@position + n)
+    end
+    def advance_past(delimiter)
+      p = (@position ... @length).find { |i| @string[i] == delimiter }
+      if p
+        new(p + 1)
+      else
+        new(@length)
+      end
+    end
+    def at_end?
+      @position >= @length
+    end
+    def head
+      @string[@position, 1]
+    end
+    def rest
+      new(@position + 1)
+    end
+    def inspect(*args)
+      "<Buffer:#{@string[@position .. -1].inspect}>"
+    end
+  private
+    def new(position)
+      Buffer.new(@string, position)
+    end
+  end
+end

data/lib/name_finder/node_proxy.rb ADDED Viewed

@@ -0,0 +1,37 @@
+class NameFinder
+  class NodeProxy
+    def initialize(node, delimiter)
+      @node = node
+      @delimiter = delimiter
+    end
+    attr_reader :node, :delimiter
+    def add(buffer, term)
+      if buffer.at_end?
+        node[0] = term
+      else
+        subtree = node[buffer.head] ||= {}
+        wrap(subtree).add buffer.rest, term
+      end
+    end
+    def find(buffer, new_word=false)
+      if buffer.at_end?
+        node[0]
+      else
+        head = buffer.head
+        if subtree = node[head]
+          wrap(subtree).find(buffer.rest, head == delimiter)
+        elsif new_word
+          node[0]
+        end
+      end
+    end
+  private
+    def wrap(node)
+      NodeProxy.new(node, delimiter)
+    end
+  end
+end

data/lib/name_finder/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+class NameFinder
+  VERSION = "0.1.0"
+end

data/lib/name_finder.rb ADDED Viewed

@@ -0,0 +1,75 @@
+require "name_finder/version"
+require "name_finder/node_proxy"
+require "name_finder/buffer"
+require "set"
+# Find names from a know list in a text, taking account of names that may
+# overlap. For example, Waterloo and Waterloo East are separate stations;
+# NameFinder, knowing both, will not give a false match for Waterloo in a text
+# that mentions Waterloo East.
+#
+class NameFinder
+  # Initialize a new NameFinder. tree, if supplied, should be the data
+  # generated by the export method.
+  #
+  def initialize(tree={})
+    @tree = tree
+    @root = NodeProxy.new(tree, delimiter)
+  end
+  attr_reader :root
+  # Add a term to NameFinder's dictionary
+  #
+  def add(term)
+    root.add Buffer.new(normalize(term) + delimiter), term
+  end
+  # Find the first name from the dictionary in haystack
+  #
+  def find_in(haystack)
+    find(haystack) do |found|
+      return found
+    end
+    nil
+  end
+  # Find all names from the dictionary in haystack.
+  #
+  def find_all_in(haystack)
+    Set.new.tap { |all|
+      find(haystack) do |found|
+        all << found
+      end
+    }.to_a
+  end
+  # Export the tree of the current dictionary for later re-importing.
+  #
+  def export
+    @tree
+  end
+private
+  def find(haystack)
+    remaining = Buffer.new(normalize(haystack) + delimiter)
+    while !remaining.at_end?
+      found = root.find(remaining)
+      if found
+        yield found
+        remaining = remaining.advance_by(found.length)
+      else
+        remaining = remaining.advance_past(delimiter)
+      end
+    end
+  end
+  def normalize(term)
+    term.downcase.gsub(/[^a-z]+/, delimiter)
+  end
+  def delimiter
+    " "
+  end
+end

data/perf/benchmark.rb ADDED Viewed

@@ -0,0 +1,8 @@
+require File.expand_path("../performance", __FILE__)
+require "benchmark"
+# Time a single run of the NameFinder workload and print the measurement
+# under the "find_all" label.
+runner = NameFinderJob.new
+Benchmark.benchmark { |bench| bench.report("find_all") { runner.run } }

data/perf/performance.rb ADDED Viewed

@@ -0,0 +1,27 @@
+lib_path = File.expand_path("../../lib", __FILE__)
+$:.unshift lib_path unless $:.include?(lib_path)
+require "name_finder"
+class NameFinderJob
+  SOURCE_PATH = File.expand_path("../sample.txt", __FILE__)
+  NAMES = [
+    "Mr Bennet",
+    "Mrs Bennet",
+    "Miss Bingley",
+    "Elizabeth",
+    "Luther Blissett"
+  ]
+  EXPECTED = ["Mr Bennet", "Elizabeth", "Mrs Bennet", "Miss Bingley"]
+  def run
+    source = File.read(SOURCE_PATH)
+    nf = NameFinder.new
+    NAMES.each do |name|
+      nf.add name
+    end
+    result = nf.find_all_in(source)
+    raise "Unexpected output: #{result.inspect}" unless result == EXPECTED
+  end
+end

data/test/name_finder_test.rb ADDED Viewed

@@ -0,0 +1,89 @@
+require File.expand_path("../test_helper", __FILE__)
+require "name_finder"
+# Behavioural specs for NameFinder, written with minitest/spec.
+describe NameFinder do
+  # Every example starts from a finder with an empty dictionary.
+  subject { NameFinder.new }
+  # find_in returns the first dictionary name present in the text, or nil.
+  describe "find_in" do
+    it "should find an exact match" do
+      subject.add "aa bb"
+      subject.find_in("aa bb").must_equal "aa bb"
+    end
+    # The result is the term exactly as it was added, not as it appears in
+    # the searched text.
+    it "should be case insensitive and case preserving" do
+      subject.add "Aa bb"
+      subject.find_in("AA BB").must_equal "Aa bb"
+    end
+    it "should find a substring match with text before" do
+      subject.add "aa bb"
+      subject.find_in("xx aa bb").must_equal "aa bb"
+    end
+    it "should find a substring match with text after" do
+      subject.add "aa bb"
+      subject.find_in("aa bb xx").must_equal "aa bb"
+    end
+    it "should find a substring match with text before and after" do
+      subject.add "aa bb"
+      subject.find_in("xx aa bb xx").must_equal "aa bb"
+    end
+    # Nothing has been added to the dictionary here.
+    it "should return nil for no match" do
+      subject.find_in("aa").must_be_nil
+    end
+    # Matches must cover whole words at both ends.
+    it "should not find a substring that does not end on a word boundary" do
+      subject.add "aa bb"
+      subject.find_in("aa bbb").must_be_nil
+    end
+    it "should not find a substring that does not begin on a word boundary" do
+      subject.add "aa bb"
+      subject.find_in("aaa bb").must_be_nil
+    end
+    # Several terms share the prefix "aa"; only the one matching the text
+    # exactly up to a word boundary should be returned.
+    it "should find longest exact match" do
+      subject.add "aa"
+      subject.add "aa bb"
+      subject.add "aa bbb"
+      subject.add "aa bbbb"
+      subject.find_in("xx aa bbb xx").must_equal "aa bbb"
+    end
+    # The full stop in the text is ignored when matching.
+    it "should find without regard to punctuation" do
+      subject.add "Mr Ee"
+      subject.find_in("Mr. Ee").must_equal "Mr Ee"
+    end
+  end
+  # find_all_in returns every distinct name found, in order of appearance.
+  describe "find_all_in" do
+    it "should find multiple simple matches" do
+      subject.add "aa bb"
+      subject.add "cc dd"
+      subject.find_all_in("aa bb cc dd").must_equal ["aa bb", "cc dd"]
+    end
+    # "bb" occurs only as part of "aa bb", so it must not be reported.
+    it "should not find partially occluded matches" do
+      subject.add "aa bb"
+      subject.add "bb"
+      subject.add "cc dd"
+      subject.find_all_in("aa bb cc dd").must_equal ["aa bb", "cc dd"]
+    end
+    it "should find each match only once" do
+      subject.add "aa bb"
+      subject.add "cc dd"
+      subject.find_all_in("aa bb cc dd aa bb cc dd").must_equal ["aa bb", "cc dd"]
+    end
+  end
+  # The exported tree can be passed to a new NameFinder to rebuild the
+  # same dictionary.
+  it "should export and import tree" do
+    subject.add "test data"
+    export = subject.export
+    nf2 = NameFinder.new(export)
+    nf2.find_in("test data").must_equal "test data"
+  end
+end

data/test/test_helper.rb ADDED Viewed

@@ -0,0 +1,4 @@
+lib_path = File.expand_path("../../lib", __FILE__)
+$:.unshift lib_path unless $:.include?(lib_path)
+require "minitest/spec"
+require "minitest/autorun"

metadata ADDED Viewed

@@ -0,0 +1,74 @@
+--- !ruby/object:Gem::Specification
+name: name_finder
+version: !ruby/object:Gem::Version
+  prerelease:
+  version: 0.1.0
+platform: ruby
+authors:
+- Paul Battley
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-08-04 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  type: :development
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+    none: false
+  prerelease: false
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+    none: false
+description: Find matching names in text, taking account of names that overlap but
+  are different (Waterloo and Waterloo East stations, for example).
+email: pbattley@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files:
+- COPYING.txt
+- README.md
+files:
+- lib/name_finder/version.rb
+- lib/name_finder/buffer.rb
+- lib/name_finder/node_proxy.rb
+- lib/name_finder.rb
+- test/test_helper.rb
+- test/name_finder_test.rb
+- perf/benchmark.rb
+- perf/performance.rb
+- COPYING.txt
+- README.md
+homepage: https://github.com/threedaymonk/name_finder
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+  none: false
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+  none: false
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.25
+signing_key:
+specification_version: 3
+summary: Find matching names in text
+test_files:
+- test/name_finder_test.rb

name_finder 0.1.0