houdinirx 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/README +39 -0
- data/Rakefile +1 -0
- data/houdinirx.gemspec +20 -0
- data/lib/houdinirx/match_result.rb +27 -0
- data/lib/houdinirx/regex_helper.rb +54 -0
- data/lib/houdinirx/string.rb +13 -0
- data/lib/houdinirx/version.rb +3 -0
- data/lib/houdinirx.rb +25 -0
- metadata +77 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
A Ruby DSL for building long, complex regular expressions that are used frequently across large data-extraction projects.
|
2
|
+
|
3
|
+
This is the simplest possible example, but given the following text:
|
4
|
+
t = "$10.99 (555) 444-3322 Boston, MA"
|
5
|
+
|
6
|
+
Instead of writing a regular expression like the following:
|
7
|
+
data = t.match(/\$(\d+[,\d]*\.\d+)\s*(\(\d{3}\)\s\d{3}-\d{4})\s*(\w+),\s(\w+)/)
|
8
|
+
|
9
|
+
and then retrieving the data with match result indices:
|
10
|
+
data[1] => "10.99"
|
11
|
+
data[2] => "(555) 444-3322"
|
12
|
+
data[3] => "Boston"
|
13
|
+
data[4] => "MA"
|
14
|
+
|
15
|
+
We can use Houdini to build the regular expression in easy-to-read, reusable pieces.
|
16
|
+
|
17
|
+
Define a regular expression for use across your project:
|
18
|
+
Houdini.define(:word, /\w+/)
|
19
|
+
|
20
|
+
Call hmatch or hscan on a string and build your expression pieces with the r() method, either by defining the expression inline or by referring to a pre-defined expression by name. Then lay out the overall match with the m() method, wrapping each name you want captured in parentheses.
|
21
|
+
|
22
|
+
data = t.hmatch do
|
23
|
+
r("amount", /\d+[,\d]*\.\d+/)
|
24
|
+
r("phone_number", /\(\d{3}\)\s+\d{3}-\d{4}/)
|
25
|
+
r("city", :word)
|
26
|
+
r("state", :word)
|
27
|
+
m("\\$(amount) (phone_number) (city), (state)")
|
28
|
+
end
|
29
|
+
|
30
|
+
Now you can access the data as methods on the resulting object:
|
31
|
+
|
32
|
+
data.amount => "10.99"
|
33
|
+
data.phone_number => "(555) 444-3322"
|
34
|
+
data.city => "Boston"
|
35
|
+
data.state => "MA"
|
36
|
+
|
37
|
+
Or access the original MatchData object:
|
38
|
+
|
39
|
+
data.match => #<MatchData "$10.99 (555) 444-3322 Boston, MA" 1:"10.99" 2:"(555) 444-3322" 3:"Boston" 4:"MA">
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'bundler/gem_tasks'
|
data/houdinirx.gemspec
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "houdinirx/version"
|
4
|
+
|
5
|
+
Gem::Specification.new do |s|
|
6
|
+
s.name = "houdinirx"
|
7
|
+
s.version = Houdini::VERSION
|
8
|
+
s.authors = ["Eric Campbell", "Duncan Mak"]
|
9
|
+
s.email = ["ericcampbell59@gmail.com", "duncanmak@gmail.com"]
|
10
|
+
s.homepage = ""
|
11
|
+
s.summary = %q{A Ruby DSL for building regular expressions}
|
12
|
+
s.description = %q{A ruby DSL for building long complex regular expressions used frequently across large data extraction projects}
|
13
|
+
|
14
|
+
s.rubyforge_project = "houdinirx"
|
15
|
+
|
16
|
+
s.files = `git ls-files`.split("\n")
|
17
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
18
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
19
|
+
s.require_paths = ["lib"]
|
20
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Houdini
|
2
|
+
class MatchResult
|
3
|
+
|
4
|
+
attr_accessor :match, :captures
|
5
|
+
|
6
|
+
def initialize(match, captures)
|
7
|
+
@match = match
|
8
|
+
|
9
|
+
raise RuntimeError, "Regex failed" if @match.nil?
|
10
|
+
@captures = captures.collect {|c| c.to_sym }
|
11
|
+
end
|
12
|
+
|
13
|
+
def method_missing(method, *args, &block)
|
14
|
+
if @captures.include?(method)
|
15
|
+
index = @captures.index(method)
|
16
|
+
if @match.class == MatchData
|
17
|
+
@match.captures[index]
|
18
|
+
else
|
19
|
+
@match[index]
|
20
|
+
end
|
21
|
+
else
|
22
|
+
raise RuntimeError, "#{method} not captured"
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module Houdini
|
2
|
+
class RegexHelper
|
3
|
+
attr_accessor :expressions, :match_string, :captures, :regexp
|
4
|
+
|
5
|
+
def initialize
|
6
|
+
@expressions = []
|
7
|
+
@captures = []
|
8
|
+
@regexp = nil
|
9
|
+
end
|
10
|
+
|
11
|
+
def r(*args)
|
12
|
+
if args.size == 2
|
13
|
+
@expressions << [args[0], args[1]]
|
14
|
+
else
|
15
|
+
expression = args.pop
|
16
|
+
args.each do |name|
|
17
|
+
@expressions << [name, expression]
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
def m(match_str)
|
23
|
+
@match_string = match_str
|
24
|
+
end
|
25
|
+
|
26
|
+
def match(text, options=Regexp::IGNORECASE)
|
27
|
+
@regexp = match_string.gsub(/\s/, "\\s*")
|
28
|
+
replace_expressions!(@regexp)
|
29
|
+
r = Regexp.new(@regexp, options)
|
30
|
+
puts r.inspect
|
31
|
+
Houdini::MatchResult.new(text.match(r), @captures)
|
32
|
+
end
|
33
|
+
|
34
|
+
def scan(text, options=Regexp::IGNORECASE)
|
35
|
+
@regexp = match_string.gsub(/\s/, "\\s*")
|
36
|
+
replace_expressions!(@regexp)
|
37
|
+
r = Regexp.new(@regexp, options)
|
38
|
+
scans = text.scan(r)
|
39
|
+
scans.collect {|scan| Houdini::MatchResult.new(scan, @captures) }
|
40
|
+
end
|
41
|
+
|
42
|
+
protected
|
43
|
+
def replace_expressions!(regex)
|
44
|
+
@expressions.each do |name, r|
|
45
|
+
if regex =~ /\(#{name}\)/
|
46
|
+
@captures << name
|
47
|
+
end
|
48
|
+
r = Houdini[r].source if r.class == Symbol
|
49
|
+
r = r.source if r.class == Regexp
|
50
|
+
regex.gsub!(name, r)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
class String
|
2
|
+
def hmatch(options=nil, &block)
|
3
|
+
regex_helper = Houdini::RegexHelper.new()
|
4
|
+
regex_helper.instance_eval(&block)
|
5
|
+
regex_helper.match(self)
|
6
|
+
end
|
7
|
+
|
8
|
+
def hscan(options=nil, &block)
|
9
|
+
regex_helper = Houdini::RegexHelper.new()
|
10
|
+
regex_helper.instance_eval(&block)
|
11
|
+
regex_helper.scan(self)
|
12
|
+
end
|
13
|
+
end
|
data/lib/houdinirx.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
$:.unshift(File.dirname(__FILE__)) unless
|
2
|
+
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
3
|
+
|
4
|
+
require "houdinirx/version"
|
5
|
+
require "houdinirx/string"
|
6
|
+
require "houdinirx/regex_helper"
|
7
|
+
require "houdinirx/match_result"
|
8
|
+
|
9
|
+
module Houdini
|
10
|
+
|
11
|
+
class << self
|
12
|
+
def [](name)
|
13
|
+
definitions[name]
|
14
|
+
end
|
15
|
+
|
16
|
+
def define(name, regex)
|
17
|
+
definitions[name] = regex
|
18
|
+
end
|
19
|
+
|
20
|
+
def definitions
|
21
|
+
@@def ||= {}
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
end
|
metadata
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: houdinirx
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 29
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 0.0.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Eric Campbell
|
14
|
+
- Duncan Mak
|
15
|
+
autorequire:
|
16
|
+
bindir: bin
|
17
|
+
cert_chain: []
|
18
|
+
|
19
|
+
date: 2011-08-01 00:00:00 Z
|
20
|
+
dependencies: []
|
21
|
+
|
22
|
+
description: A ruby DSL for building long complex regular expressions used frequently across large data extraction projects
|
23
|
+
email:
|
24
|
+
- ericcampbell59@gmail.com
|
25
|
+
- duncanmak@gmail.com
|
26
|
+
executables: []
|
27
|
+
|
28
|
+
extensions: []
|
29
|
+
|
30
|
+
extra_rdoc_files: []
|
31
|
+
|
32
|
+
files:
|
33
|
+
- .gitignore
|
34
|
+
- Gemfile
|
35
|
+
- README
|
36
|
+
- Rakefile
|
37
|
+
- houdinirx.gemspec
|
38
|
+
- lib/houdinirx.rb
|
39
|
+
- lib/houdinirx/match_result.rb
|
40
|
+
- lib/houdinirx/regex_helper.rb
|
41
|
+
- lib/houdinirx/string.rb
|
42
|
+
- lib/houdinirx/version.rb
|
43
|
+
homepage: ""
|
44
|
+
licenses: []
|
45
|
+
|
46
|
+
post_install_message:
|
47
|
+
rdoc_options: []
|
48
|
+
|
49
|
+
require_paths:
|
50
|
+
- lib
|
51
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
52
|
+
none: false
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
hash: 3
|
57
|
+
segments:
|
58
|
+
- 0
|
59
|
+
version: "0"
|
60
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
hash: 3
|
66
|
+
segments:
|
67
|
+
- 0
|
68
|
+
version: "0"
|
69
|
+
requirements: []
|
70
|
+
|
71
|
+
rubyforge_project: houdinirx
|
72
|
+
rubygems_version: 1.8.5
|
73
|
+
signing_key:
|
74
|
+
specification_version: 3
|
75
|
+
summary: A Ruby DSL for building regular expressions
|
76
|
+
test_files: []
|
77
|
+
|