RubyGems - list_matcher - Versions diffs - 1.0.0 - Mend

list_matcher 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

data/list_matcher.gemspec ADDED Viewed

@@ -0,0 +1,23 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'list_matcher/version'
+Gem::Specification.new do |spec|
+  spec.name          = "list_matcher"
+  spec.version       = ListMatcher::VERSION
+  spec.authors       = ["dfhoughton"]
+  spec.email         = ["dfhoughton@gmail.com"]
+  spec.summary       = %q{List::Matcher automates the generation of efficient regular expressions.}
+  spec.description   = spec.summary
+  spec.homepage      = "https://github.com/dfhoughton/list_matcher"
+  spec.license       = "MIT"
+  spec.files         = `git ls-files -z`.split("\x0")
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+  spec.add_development_dependency "bundler", "~> 1.7"
+  spec.add_development_dependency "rake", "~> 10.0"
+end

data/test/basic_test.rb ADDED Viewed

@@ -0,0 +1,248 @@
+require "minitest/autorun"
+require "list_matcher"
+class BasicTest < Minitest::Test
+  def test_simple
+    words = %w(cat dog camel)
+    rx = List::Matcher.pattern words
+    rx = Regexp.new rx
+    words.each do |w|
+      assert rx === w
+    end
+  end
+  def test_word_chars
+    word = (1..255).map(&:chr).select{ |c| /\w/ === c }
+    chars = word + ['+']
+    rx = List::Matcher.pattern chars
+    assert_equal '[+\w]', rx
+    rx = Regexp.new rx
+    chars.each do |c|
+      assert rx === c
+    end
+    chars = word + ['@']
+    rx = List::Matcher.pattern chars
+    assert_equal '[@\w]', rx
+    rx = Regexp.new rx
+    chars.each do |c|
+      assert rx === c
+    end
+  end
+  def test_word_chars_case_insensitive
+    word = (1..255).map(&:chr).select{ |c| /\w/ === c }
+    chars = word + ['+']
+    rx = List::Matcher.pattern chars, case_insensitive: true
+    assert_equal '(?i:[+\w])', rx
+    rx = Regexp.new rx
+    chars.each do |c|
+      assert rx === c
+    end
+  end
+  def test_num_chars
+    words = (0..9).map(&:to_s)
+    rx = List::Matcher.pattern words
+    assert_equal '\d', rx
+    rx = Regexp.new rx
+    words.each do |w|
+      assert rx === w
+    end
+  end
+  def test_space_chars
+    words = (1..255).map(&:chr).select{ |c| c =~ /\s/ }
+    rx = List::Matcher.pattern words
+    assert_equal '\s', rx
+    rx = Regexp.new rx
+    words.each do |w|
+      assert rx === w
+    end
+  end
+  def test_bounds
+    words = %w(cat dog)
+    rx = List::Matcher.pattern words, bound: true
+    assert_equal '(?:\b(?:cat|dog)\b)', rx
+    rx = Regexp.new rx
+    words.each do |w|
+      assert rx === w
+    end
+  end
+  def test_repeats
+    rx = List::Matcher.pattern %w(aaaaaaaaaa)
+    assert_equal '(?:a{10})', rx
+    rx = List::Matcher.pattern %w(bbbaaaaaaaaaabbbaaaaaaaaaa)
+    assert_equal '(?:(?:bbba{10}){2})', rx
+  end
+  def test_opt_suffix
+    words = %w(the them)
+    rx = List::Matcher.pattern words
+    assert_equal '(?:them?)', rx
+    rx = Regexp.new rx
+    words.each do |w|
+      assert rx === w
+    end
+  end
+  def test_opt_prefix
+    words = %w(at cat)
+    rx = List::Matcher.pattern words
+    assert_equal '(?:c?at)', rx
+    rx = Regexp.new rx
+    words.each do |w|
+      assert rx === w
+    end
+  end
+  def test_symbols_string
+    words = ['cat dog']
+    rx = List::Matcher.pattern words, symbols: { ' ' => '\s++' }
+    assert_equal '(?:cat\s++dog)', rx
+    rx = Regexp.new rx
+    words.each do |w|
+      assert rx === w
+    end
+  end
+  def test_symbols_rx
+    words = %w(year year2000 year1999)
+    rx = List::Matcher.pattern words, symbols: { /(?<!\d)\d{4}(?!\d)/ => nil }
+    assert_equal '(?:year(?-mix:(?<!\d)\d{4}(?!\d))?)', rx
+    rx = Regexp.new rx
+    words.each do |w|
+      assert rx === w
+    end
+  end
+  def test_fancy_rx
+    words = ['   cat   dog  ']
+    good = ['the cat  dog is an odd beast']
+    bad = ['the catdog is an odd beast', 'the cat doggy is an odd beast', 'the scat dog is an odd beast']
+    rx = List::Matcher.pattern words, bound: true, normalize_whitespace: true
+    assert_equal '(?:\bcat\s++dog\b)', rx
+    rx = Regexp.new rx
+    assert good.all?{ |w| rx === w }, 'not bothered by odd space'
+    assert bad.none?{ |w| rx === w }, 'needs interior space and boundaries'
+  end
+  def test_symbols_borders
+    words = (1..31).to_a
+    rx = List::Matcher.pattern words, bound: { test: /\d/, left: '(?<!\d)', right: '(?!\d)' }
+    rx = Regexp.new rx
+    good = words.map{ |n| "a#{n}b" }
+    bad = words.map{ |n| "0#{n}0" }
+    assert good.all?{ |w| rx === w }
+    assert bad.none?{ |w| rx === w }
+  end
+  def test_string_bound
+    rx = List::Matcher.pattern ['cat'], bound: :string
+    assert_equal '(?:\Acat\z)', rx
+    rx = Regexp.new rx
+    assert rx === 'cat', 'matches whole string'
+    assert "cat\ndog" !~ rx, 'line breaks do not suffice'
+    assert ' cat ' !~ rx, 'word boundaries do not suffice'
+  end
+  def test_line_bound
+    rx = List::Matcher.pattern ['cat'], bound: :line
+    assert_equal '(?:^cat$)', rx
+    rx = Regexp.new rx
+    assert rx === 'cat', 'matches whole string'
+    assert rx === "cat\ndog", 'line breaks suffice'
+    assert ' cat ' !~ rx, 'word boundaries do not suffice'
+  end
+  def test_dup_atomic
+    m = List::Matcher.new atomic: true
+    rx = m.pattern %w( cat dog ), atomic: false
+    assert_equal "cat|dog", rx
+  end
+  def test_dup_backtracking
+    m = List::Matcher.new backtracking: true
+    rx = m.pattern %w( cat dog ), backtracking: false
+    assert_equal "(?>cat|dog)", rx
+  end
+  def test_dup_bound
+    m = List::Matcher.new bound: false, atomic: false
+    rx = m.pattern %w( cat dog ), bound: true
+    assert_equal '\b(?:cat|dog)\b', rx
+  end
+  def test_dup_bound_string
+    m = List::Matcher.new bound: false, atomic: false
+    rx = m.pattern %w( cat dog ), bound: :string
+    assert_equal '\A(?:cat|dog)\z', rx
+  end
+  def test_dup_bound_line
+    m = List::Matcher.new bound: false, atomic: false
+    rx = m.pattern %w( cat dog ), bound: :line
+    assert_equal '^(?:cat|dog)$', rx
+  end
+  def test_dup_bound_fancy
+    m = List::Matcher.new bound: false, atomic: false
+    rx = m.pattern %w( 1 2 ), bound: { test: /\d/, left: '(?<!\d)', right: '(?!\d)' }
+    assert_equal '(?<!\d)[12](?!\d)', rx
+  end
+  def test_dup_strip
+    m = List::Matcher.new atomic: false
+    rx = m.pattern [%( cat )], strip: true
+    assert_equal 'cat', rx
+  end
+  def test_dup_case_insensitive
+    m = List::Matcher.new
+    rx = m.pattern %w(cat), case_insensitive: true
+    assert_equal '(?i:cat)', rx
+  end
+  def test_dup_normalize_whitespace
+    m = List::Matcher.new atomic: false
+    rx = m.pattern ['  cat     dog  '], normalize_whitespace: true
+    assert_equal 'cat\s++dog', rx
+  end
+  def test_dup_symbols
+    m = List::Matcher.new atomic: false
+    rx = m.pattern ['cat dog'], symbols: { ' ' => '\s++' }
+    assert_equal 'cat\s++dog', rx
+  end
+  def test_multiline
+    rx = List::Matcher.pattern %w( cat dog ), multiline: true
+    assert_equal '(?m:cat|dog)', rx
+  end
+  def test_dup_multiline
+    m = List::Matcher.new atomic: false
+    rx = m.pattern %w( cat dog ), multiline: true
+    assert_equal '(?m:cat|dog)', rx
+  end
+  def test_name
+    m = List::Matcher.new name: :foo
+    rx = m.pattern %w( cat dog )
+    assert_equal '(?<foo>cat|dog)', rx
+  end
+  def test_vetting_good
+    List::Matcher.pattern %w(cat), symbols: { foo: 'bar' }, vet: true
+    assert true, 'good regexen are vetted appropriately'
+  end
+  def test_vetting_bad
+    assert_raises SyntaxError do
+      List::Matcher.pattern %w(cat), symbols: { foo: '+' }, vet: true
+    end
+  end
+end

data/test/benchmarks.rb ADDED Viewed

@@ -0,0 +1,149 @@
+require 'list_matcher'
+require 'benchmark/ips'
+size = 100
+magnitudes = 3
+creation_iterations = 1000
+def words(n, char_range, size_range, avoid=Set.new)
+  set = Set.new
+  while set.size < n do
+    w = (1..rand(size_range)).map{ rand(char_range).chr }.join
+    next if avoid.include? w
+    set << w
+  end
+  set.to_a
+end
+def simple_rx(words)
+  rx = words.join "|"
+  Regexp.new "\\A(?>#{rx})\\z"
+end
+def list_rx(words)
+  List::Matcher.rx words, bound: :string
+end
+puts "RANDOM WORDS, VARIABLE LENGTH\n"
+magnitudes.times do
+  good = words size, 97..122, 10..15
+  bad  = words size, 97..122, 10..15, good
+  set = Set[*good]
+  rx  = simple_rx good
+  lrx = list_rx good
+  puts "\nnumber of words: #{size}"
+  Benchmark.ips do |bm|
+    bm.report('simple rx good') do
+      good.each{ |w| rx === w }
+    end
+    bm.report('List::Matcher good') do
+      good.each{ |w| lrx === w }
+    end
+    bm.report('set good') do
+      good.each{ |w| set.include? w }
+    end
+    bm.report('list good') do
+      good.each{ |w| good.include? w }
+    end
+    bm.compare!
+  end
+  Benchmark.ips do |bm|
+    bm.report('simple rx bad') do
+      bad.each{ |w| rx === w }
+    end
+    bm.report('List::Matcher bad') do
+      bad.each{ |w| lrx === w }
+    end
+    bm.report('set bad') do
+      bad.each{ |w| set.include? w }
+    end
+    bm.report('list bad') do
+      bad.each{ |w| good.include? w }
+    end
+    bm.compare!
+  end
+  size *= 10
+end
+def nums(length)
+  variants length, 0..9
+end
+def alphas(length)
+  variants length, 'a'..'j'
+end
+def variants(length, range)
+  out = []
+  range = range.to_a
+  tumblers = Array.new length, 0
+  (range.size ** length).times do
+    out << tumblers.map{ |t| range[t] }.join
+    tumblers[0] += 1
+    tumblers[0] %= range.size
+    (0...length-1).each do |i|
+      if tumblers[i] == 0
+        tumblers[i + 1] += 1
+        tumblers[i + 1] %= range.size
+      else
+        break
+      end
+    end
+  end
+  out
+end
+puts "\nFIXED LENGTH, FULL RANGE\n"
+(1..4).each do |i|
+  good = nums i
+  bad  = alphas i
+  lrx = list_rx good
+  set = Set[*good]
+  rx  = simple_rx good
+  puts "\nnumber of words: #{10 ** i}; List::Matcher rx: #{lrx}"
+  Benchmark.ips do |bm|
+    bm.report('simple rx creation') do
+      creation_iterations.times{ simple_rx good }
+    end
+    bm.report('List::Matcher creation') do
+      creation_iterations.times{ simple_rx good }
+    end
+    bm.report('set creation') do
+      creation_iterations.times{ Set[*good] }
+    end
+    bm.compare!
+  end
+  Benchmark.ips do |bm|
+    bm.report('simple rx good') do
+      good.each{ |w| rx === w }
+    end
+    bm.report('List::Matcher good') do
+      good.each{ |w| lrx === w }
+    end
+    bm.report('set good') do
+      good.each{ |w| set.include? w }
+    end
+    bm.report('list good') do
+      good.each{ |w| good.include? w }
+    end
+    bm.compare!
+  end
+  Benchmark.ips do |bm|
+    bm.report('simple rx bad') do
+      bad.each{ |w| rx === w }
+    end
+    bm.report('List::Matcher bad') do
+      bad.each{ |w| lrx === w }
+    end
+    bm.report('set bad') do
+      bad.each{ |w| set.include? w }
+    end
+    bm.report('list bad') do
+      bad.each{ |w| good.include? w }
+    end
+    bm.compare!
+  end
+  size *= 10
+end

data/test/stress.rb ADDED Viewed

@@ -0,0 +1,44 @@
+require "minitest/autorun"
+require "list_matcher"
+class Stress < Minitest::Test
+  def test_simple
+    (1..10).each{ basic_test 5000, 97..122, 4..8 }
+  end
+  def test_fixed_size
+    (1..10).each{ basic_test 5000, 97..122, 8..8 }
+  end
+  def test_really_big
+    basic_test 50000, 97..122, 4..8
+  end
+  def basic_test(n, range, max)
+    words = words n, range, max
+    good = words[0...n/10]
+    bad = words[n/10..-1]
+    rx = List::Matcher.rx( good, bound: true )
+    puts good.inspect unless good.all?{ |w| rx === w }
+    good.each do |w|
+      assert rx === w, "#{w} is good for #{rx}"
+    end
+    bad.each do |w|
+      assert !( rx === w ), "#{w} is bad for #{rx}"
+    end
+  end
+  def words(n, range, max)
+    words = []
+    while words.size < n
+      words += (1..n/10).map{ random_word range, max }
+      words.uniq!
+    end
+    words[0...n]
+  end
+  def random_word(range, max)
+    (1..rand(max)).map{ rand(range).chr }.join
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,87 @@
+--- !ruby/object:Gem::Specification
+name: list_matcher
+version: !ruby/object:Gem::Version
+  version: 1.0.0
+platform: ruby
+authors:
+- dfhoughton
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2015-08-16 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.7'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.7'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '10.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '10.0'
+description: List::Matcher automates the generation of efficient regular expressions.
+email:
+- dfhoughton@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- examples/date_grammar.rb
+- lib/list_matcher.rb
+- lib/list_matcher/version.rb
+- list_matcher.gemspec
+- test/basic_test.rb
+- test/benchmarks.rb
+- test/stress.rb
+homepage: https://github.com/dfhoughton/list_matcher
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.2.2
+signing_key:
+specification_version: 4
+summary: List::Matcher automates the generation of efficient regular expressions.
+test_files:
+- test/basic_test.rb
+- test/benchmarks.rb
+- test/stress.rb