houdinirx 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/README +39 -0
- data/Rakefile +1 -0
- data/houdinirx.gemspec +20 -0
- data/lib/houdinirx/match_result.rb +27 -0
- data/lib/houdinirx/regex_helper.rb +54 -0
- data/lib/houdinirx/string.rb +13 -0
- data/lib/houdinirx/version.rb +3 -0
- data/lib/houdinirx.rb +25 -0
- metadata +77 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
A Ruby DSL for building long, complex regular expressions that are used frequently across large data-extraction projects.
|
2
|
+
|
3
|
+
This is the simplest possible example, but given the following text:
|
4
|
+
t = "$10.99 (555) 444-3322 Boston, MA"
|
5
|
+
|
6
|
+
Instead of writing a regular expression like the following:
|
7
|
+
data = t.match(/\$(\d+[,\d]*\.\d+)\s*(\(\d{3}\)\s\d{3}-\d{4})\s*(\w+),\s(\w+)/)
|
8
|
+
|
9
|
+
and then retrieving the data with match result indices:
|
10
|
+
data[1] => "10.99"
|
11
|
+
data[2] => "(555) 444-3322"
|
12
|
+
data[3] => "Boston"
|
13
|
+
data[4] => "MA"
|
14
|
+
|
15
|
+
We can use Houdini to build the regular expression in easy-to-read, reusable pieces.
|
16
|
+
|
17
|
+
Define a regular expression for use across your project:
|
18
|
+
Houdini.define(:word, /\w+/)
|
19
|
+
|
20
|
+
Call hmatch or hscan on a string with a block. Inside the block, name each piece of the expression with the r() method, giving either an inline regular expression or the name of a pre-defined one. Then describe the overall match with the m() method, wrapping a piece's name in parentheses to capture it.
|
21
|
+
|
22
|
+
data = t.hmatch do
|
23
|
+
r("amount", /\d+[,\d]*\.\d+/)
|
24
|
+
r("phone_number", /\(\d{3}\)\s+\d{3}-\d{4}/)
|
25
|
+
r("city", :word)
|
26
|
+
r("state", :word)
|
27
|
+
m("\\$(amount) (phone_number) (city), (state)")
|
28
|
+
end
|
29
|
+
|
30
|
+
Now you can access the data as methods on the resulting object:
|
31
|
+
|
32
|
+
data.amount => "10.99"
|
33
|
+
data.phone_number => "(555) 444-3322"
|
34
|
+
data.city => "Boston"
|
35
|
+
data.state => "MA"
|
36
|
+
|
37
|
+
Or access the original MatchData object:
|
38
|
+
|
39
|
+
data.match => #<MatchData "$10.99 (555) 444-3322 Boston, MA" 1:"10.99" 2:"(555) 444-3322" 3:"Boston" 4:"MA">
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'bundler/gem_tasks'
|
data/houdinirx.gemspec
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "houdinirx/version"
|
4
|
+
|
5
|
+
# Gem specification for houdinirx.
Gem::Specification.new do |s|
  s.name        = "houdinirx"
  s.version     = Houdini::VERSION
  s.authors     = ["Eric Campbell", "Duncan Mak"]
  s.email       = ["ericcampbell59@gmail.com", "duncanmak@gmail.com"]
  # TODO: fill in the project's home page; an empty string is kept here
  # because the real URL is not known from this file.
  s.homepage    = ""
  s.summary     = %q{A Ruby DSL for building regular expressions}
  s.description = %q{A ruby DSL for building long complex regular expressions used frequently across large data extraction projects}

  # The rubyforge_project writer is deprecated in RubyGems, so only call it
  # when the running RubyGems still provides it.
  s.rubyforge_project = "houdinirx" if s.respond_to?(:rubyforge_project=)

  # Package exactly what git tracks; test files and executables are derived
  # from the conventional directories.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Houdini
  # Wraps the outcome of a match so that each named capture can be read as a
  # method call (for example +result.city+).
  #
  # +match+ is either a MatchData (as returned by String#match) or an Array of
  # captured strings (one element of String#scan's result). +captures+ holds
  # the capture names, as Symbols, in the same order as the capture groups.
  class MatchResult

    attr_accessor :match, :captures

    # match    - MatchData or Array of captured strings; nil means "no match".
    # captures - capture names (anything responding to to_sym), in group order.
    #
    # Raises RuntimeError ("Regex failed") when match is nil.
    def initialize(match, captures)
      @match = match

      raise RuntimeError, "Regex failed" if @match.nil?
      @captures = captures.collect { |c| c.to_sym }
    end

    # Returns the captured text for the capture named +method+.
    #
    # Raises RuntimeError ("<name> not captured") for any other method name;
    # the exception class is kept for callers that already rescue it.
    def method_missing(method, *args, &block)
      index = @captures.index(method)
      raise RuntimeError, "#{method} not captured" if index.nil?

      # MatchData#captures skips the whole-match entry, so both shapes are
      # indexed the same way as @captures.
      values = @match.is_a?(MatchData) ? @match.captures : @match
      values[index]
    end

    # Keeps respond_to? truthful for capture names. It also stops Ruby's
    # implicit conversion probes (to_ary, to_str, ...) from being routed into
    # method_missing, where they would raise RuntimeError.
    def respond_to_missing?(method, include_private = false)
      @captures.include?(method) || super
    end

  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module Houdini
  # Collects named expression pieces (via #r) and a match template (via #m),
  # then assembles them into one regular expression for #match / #scan.
  #
  # In the template every whitespace character becomes "\s*", and every
  # occurrence of a piece's name is replaced by that piece's regex source. A
  # name written as "(name)" is additionally recorded as a capture.
  class RegexHelper
    attr_accessor :expressions, :match_string, :captures, :regexp

    def initialize
      @expressions = []
      @captures = []
      @regexp = nil
    end

    # Registers one or more names for an expression. The last argument is the
    # expression (a Regexp, a Symbol naming a Houdini definition, or a String
    # of regex source); every preceding argument is a name bound to it.
    def r(*args)
      expression = args.pop
      args.each { |name| @expressions << [name, expression] }
    end

    # Sets the template string that describes the overall match.
    def m(match_str)
      @match_string = match_str
    end

    # Matches +text+ against the assembled expression (case-insensitive by
    # default) and returns a Houdini::MatchResult. MatchResult raises
    # RuntimeError when nothing matched.
    def match(text, options=Regexp::IGNORECASE)
      Houdini::MatchResult.new(text.match(build_regexp(options)), @captures)
    end

    # Scans +text+ with the assembled expression and returns one
    # Houdini::MatchResult per hit (an empty Array when nothing matched).
    def scan(text, options=Regexp::IGNORECASE)
      text.scan(build_regexp(options)).collect do |found|
        Houdini::MatchResult.new(found, @captures)
      end
    end

    protected

    # Builds the regex source from the template, stores it in @regexp and
    # returns the compiled Regexp.
    def build_regexp(options)
      raise ArgumentError, "no match string given; call m(...) first" if match_string.nil?

      # Block form so the replacement text is inserted verbatim.
      @regexp = match_string.gsub(/\s/) { "\\s*" }
      replace_expressions!(@regexp)
      Regexp.new(@regexp, options)
    end

    # Replaces every expression name in +regex+ (in place) with its source and
    # rebuilds @captures in the order the "(name)" groups appear in the
    # template, so capture names line up with the regex's group numbers.
    def replace_expressions!(regex)
      sources = expression_sources
      @captures = []
      return regex if sources.empty?

      # Longest names first so a name that is a prefix of another one cannot
      # shadow it. A single pass means inserted sources are never re-scanned.
      names = sources.keys.sort_by { |name| -name.length }
      regex.gsub!(Regexp.union(names)) do |name|
        found = Regexp.last_match
        wrapped = found.pre_match.end_with?("(") && found.post_match.start_with?(")")
        @captures << name if wrapped
        # The block's value is inserted literally; a replacement *string*
        # would reinterpret sequences such as "\+" or "\\".
        sources[name]
      end
      regex
    end

    # Maps each expression name (as a String) to its regex source. When a name
    # is registered more than once, the first registration wins.
    def expression_sources
      @expressions.each_with_object({}) do |(name, expression), sources|
        key = name.to_s
        sources[key] = source_for(expression) unless sources.key?(key)
      end
    end

    # Resolves an expression to regex source text.
    #
    # Raises ArgumentError when a Symbol has no Houdini definition.
    def source_for(expression)
      case expression
      when Symbol
        definition = Houdini[expression]
        raise ArgumentError, "no expression defined for #{expression.inspect}" if definition.nil?
        definition.is_a?(Regexp) ? definition.source : definition.to_s
      when Regexp
        expression.source
      else
        expression.to_s
      end
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
class String
  # Builds a regular expression with the Houdini DSL and matches it against
  # this string.
  #
  # options - Regexp option flags passed to Houdini::RegexHelper#match; when
  #           nil the helper's own default is used.
  # block   - evaluated in the context of a new Houdini::RegexHelper, so r()
  #           and m() can be called directly inside it.
  #
  # Returns whatever Houdini::RegexHelper#match returns.
  # Raises ArgumentError when no block is given.
  def hmatch(options=nil, &block)
    raise ArgumentError, "hmatch requires a block" unless block

    regex_helper = Houdini::RegexHelper.new
    regex_helper.instance_eval(&block)
    # Forward options only when supplied so the helper keeps its default.
    options.nil? ? regex_helper.match(self) : regex_helper.match(self, options)
  end

  # Same as #hmatch, but scans this string for every occurrence through
  # Houdini::RegexHelper#scan and returns that method's result.
  #
  # Raises ArgumentError when no block is given.
  def hscan(options=nil, &block)
    raise ArgumentError, "hscan requires a block" unless block

    regex_helper = Houdini::RegexHelper.new
    regex_helper.instance_eval(&block)
    options.nil? ? regex_helper.scan(self) : regex_helper.scan(self, options)
  end
end
|
data/lib/houdinirx.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
$:.unshift(File.dirname(__FILE__)) unless
|
2
|
+
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
3
|
+
|
4
|
+
require "houdinirx/version"
|
5
|
+
require "houdinirx/string"
|
6
|
+
require "houdinirx/regex_helper"
|
7
|
+
require "houdinirx/match_result"
|
8
|
+
|
9
|
+
# Registry of named, reusable expressions shared across a project.
module Houdini

  class << self
    # Returns the expression registered under +name+, or nil when there is
    # none.
    def [](name)
      definitions[name]
    end

    # Registers +regex+ under +name+, replacing any earlier definition with
    # the same name. Returns +regex+.
    def define(name, regex)
      definitions[name] = regex
    end

    # The Hash of every registered expression, keyed by name. It is held in a
    # module-level instance variable rather than a class variable, so the
    # state belongs to Houdini alone.
    def definitions
      @definitions ||= {}
    end
  end

end
|
metadata
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: houdinirx
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 29
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 0.0.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Eric Campbell
|
14
|
+
- Duncan Mak
|
15
|
+
autorequire:
|
16
|
+
bindir: bin
|
17
|
+
cert_chain: []
|
18
|
+
|
19
|
+
date: 2011-08-01 00:00:00 Z
|
20
|
+
dependencies: []
|
21
|
+
|
22
|
+
description: A ruby DSL for building long complex regular expressions used frequently across large data extraction projects
|
23
|
+
email:
|
24
|
+
- ericcampbell59@gmail.com
|
25
|
+
- duncanmak@gmail.com
|
26
|
+
executables: []
|
27
|
+
|
28
|
+
extensions: []
|
29
|
+
|
30
|
+
extra_rdoc_files: []
|
31
|
+
|
32
|
+
files:
|
33
|
+
- .gitignore
|
34
|
+
- Gemfile
|
35
|
+
- README
|
36
|
+
- Rakefile
|
37
|
+
- houdinirx.gemspec
|
38
|
+
- lib/houdinirx.rb
|
39
|
+
- lib/houdinirx/match_result.rb
|
40
|
+
- lib/houdinirx/regex_helper.rb
|
41
|
+
- lib/houdinirx/string.rb
|
42
|
+
- lib/houdinirx/version.rb
|
43
|
+
homepage: ""
|
44
|
+
licenses: []
|
45
|
+
|
46
|
+
post_install_message:
|
47
|
+
rdoc_options: []
|
48
|
+
|
49
|
+
require_paths:
|
50
|
+
- lib
|
51
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
52
|
+
none: false
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
hash: 3
|
57
|
+
segments:
|
58
|
+
- 0
|
59
|
+
version: "0"
|
60
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
hash: 3
|
66
|
+
segments:
|
67
|
+
- 0
|
68
|
+
version: "0"
|
69
|
+
requirements: []
|
70
|
+
|
71
|
+
rubyforge_project: houdinirx
|
72
|
+
rubygems_version: 1.8.5
|
73
|
+
signing_key:
|
74
|
+
specification_version: 3
|
75
|
+
summary: A Ruby DSL for building regular expressions
|
76
|
+
test_files: []
|
77
|
+
|