template_parser 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +5 -0
- data/Gemfile +4 -0
- data/README.md +79 -0
- data/Rakefile +2 -0
- data/lib/template_parser.rb +248 -0
- data/lib/template_parser/version.rb +3 -0
- data/template_parser.gemspec +21 -0
- metadata +63 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
# Template Parser
|
2
|
+
|
3
|
+
Parse ASCII files by example
|
4
|
+
|
5
|
+
## Basic usage
|
6
|
+
|
7
|
+
First, create a template. In this example I'm parsing an ASCII purchase
|
8
|
+
order of some sort.
|
9
|
+
|
10
|
+
template = TemplateParser.compile_template([
|
11
|
+
"?<:>REPORT NUMBER: ? ",
|
12
|
+
" INVENTORY NO: #number REF NUMBER: #refer_number LOGICAL UNIT: #unit_number ? ",
|
13
|
+
" CATALOG CODE: :cat_code :sub_code VENDOR NO: :vendor ",
|
14
|
+
" TYPE CODE: :lpr FLAG: ? BILL FLAG: <:bill_flag>#_ SHIPPING FROM: ? SHIPPING TO: ? ",
|
15
|
+
" FROM-LOCATION: :address_code TO LOCATION: :to_address_code ",
|
16
|
+
" :address_name ? ",
|
17
|
+
" :address_1 ? ",
|
18
|
+
" :address_2 ? ",
|
19
|
+
" :address_city :address_postal ? ",
|
20
|
+
" USER NAME: :user_name SERIAL NUMBER: :serial "
|
21
|
+
].join("\n"))
|
22
|
+
|
23
|
+
Let's break down the template a little bit.
|
24
|
+
|
25
|
+
The first thing to note is that it's not hard to imagine what the data
|
26
|
+
we're parsing will actually look like. That's because the field
|
27
|
+
definitions go in the physical locations where the data should be, while
|
28
|
+
the rest of the report appears unchanged. While parsing a report, if a
|
29
|
+
single character is out of place, parsing will fail with a detailed
|
30
|
+
error message. I've found that by failing fast I've been able to find
|
31
|
+
the edge cases easily and get perfect parsing results on literally
|
32
|
+
gigabytes of generated reports.
|
33
|
+
|
34
|
+
There are a few different types of fields visible here as well. Some
|
35
|
+
start with #, indicating a numeric field. Most start with : and look
|
36
|
+
like symbols, indicating a text field. There are also some ?'s
|
37
|
+
indicating that something may appear there but we'll ignore it. Finally
|
38
|
+
there are some zero-width field names which look like <:name>:_ which
|
39
|
+
can appear where the field name would otherwise not fit. A variation of
|
40
|
+
that is where <:> can be used by itself as a 0-width delimiter to
|
41
|
+
prevent a field from being too long.
|
42
|
+
|
43
|
+
Side note: typically I would use [ruby here doc](http://blog.jayfields.com/2006/12/ruby-multiline-strings-here-doc-or.html)
|
44
|
+
string notation, but here I am concatenating an array of strings to help
|
45
|
+
demonstrate that whitespace is significant. That way I can just copy in
|
46
|
+
an example of the report I'm interested in and carve out my field
|
47
|
+
definitions directly.
|
48
|
+
|
49
|
+
An array of line matchers is returned from the compile_template method.
|
50
|
+
|
51
|
+
### Using the array of line matchers (template)
|
52
|
+
|
53
|
+
Does the given line have a match in any of the lines in the template?
|
54
|
+
|
55
|
+
TemplateParser.match_template?(template, line)
|
56
|
+
|
57
|
+
Get the results of matching any line in the template to the given line
|
58
|
+
|
59
|
+
TemplateParser.match_template(template, line, file_position_metadata) { |matcher, converted_data, raw_data| }
|
60
|
+
|
61
|
+
Process all given lines against the template in order
|
62
|
+
|
63
|
+
TemplateParser.process_lines(template, lines, file_position_metadata)
|
64
|
+
|
65
|
+
Return true if process_lines would run successfully on the given lines
|
66
|
+
for the given template.
|
67
|
+
|
68
|
+
TemplateParser.lines_match_template?(template, lines)
|
69
|
+
|
70
|
+
### Using individual line matchers
|
71
|
+
|
72
|
+
Does the given line match the given line matcher?
|
73
|
+
|
74
|
+
TemplateParser.match_line?(line_matcher, line)
|
75
|
+
|
76
|
+
Process a given line against a given line matcher
|
77
|
+
|
78
|
+
TemplateParser.process_line(line_matcher, line, file_position_metadata) { |matcher, converted_data, raw_data| }
|
79
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,248 @@
|
|
1
|
+
# Parses fixed-layout ASCII reports "by example".
#
# A template is a copy of the report in which the data has been replaced by
# field definitions placed at the physical position of the data:
#
#   :name    text field (value is stripped of surrounding whitespace)
#   #name    integer field (blank yields nil, leading zeros are ignored)
#   ?        ignored region
#   <:name>  zero-width name (and type) applied to the part that follows
#   <:>      zero-width delimiter that simply ends the preceding part
#
# A field may end with `]`, which widens the field by one column.
#
# compile_template turns each template line into a "line matcher": an Array of
# part Hashes with the keys :type, :length, :template and, where applicable,
# :symbol and :string. The first Hash of every line matcher additionally
# carries :regex (used by match_line?) and :formatter (used by the format_*
# methods).
module TemplateParser
  # Raised when one line cannot be processed against one line matcher.
  # Exposes the failing part matcher, the line, the column offset and the
  # caller-supplied metadata.
  class ProcessingError < StandardError
    attr_reader :matcher, :line, :pos, :meta

    def initialize(matcher, line, pos, meta, message)
      @matcher = matcher
      @line = line
      @pos = pos
      @meta = meta
      super(message)
    end
  end

  # Raised when a line matches none of the lines of a template. The message is
  # the given message followed by the messages of the individual errors.
  class ProcessingErrors < StandardError
    attr_reader :errors

    def initialize(message, errors)
      @errors = errors
      # Only real errors contribute to the report; nil (or anything else that
      # has no message) is skipped instead of raising NoMethodError.
      details = errors.select { |e| e.respond_to?(:message) }.map { |e| e.message }
      super(message + "\n\n" + details.join("\n-----------------------------------------------------------\n"))
    end
  end

  module Parser
    # Create an array of line matchers based on a template.
    #
    # template - a String; each of its lines becomes one line matcher.
    def compile_template(template)
      template.each_line.map { |line| compile_template_line(line.chomp) }
    end

    # Does the given line match the given line matcher?
    #
    # Returns the match position (truthy) or nil.
    def match_line?(matchers, line)
      matchers.first[:regex] =~ line
    end

    # Does the given line have a match in any of the lines in the template?
    #
    # Returns the first line matcher that matches, or nil.
    def match_template?(template, line)
      template.detect { |matcher| match_line?(matcher, line) }
    end

    # Return true if every template line matches the line at the same index.
    #
    # NOTE: the compiled regex tolerates literal text that was right-trimmed
    # at the end of a line, whereas process_line compares literal text
    # exactly, so a true result is not a strict guarantee that process_lines
    # will succeed.
    def lines_match_template?(template, lines)
      template.zip(lines).all? do |matchers, line|
        match_line?(matchers, line)
      end
    end

    # Get the results of matching any line in the template to the given line.
    #
    # With a block, yields (part_matcher, converted_data, raw_data) for every
    # field of the first matching template line; without a block returns a
    # Hash of symbol => converted data.
    #
    # Raises ProcessingErrors when no template line matches.
    def match_template(template, line, meta = {})
      matchers = match_template?(template, line)
      if matchers
        return process_line(matchers, line, meta) unless block_given?
        return process_line(matchers, line, meta) { |*args| yield(*args) }
      end

      # Re-run every template line to collect a detailed error for each one.
      # A template line may process cleanly even though its regex rejected
      # the line; only genuine errors are kept.
      errors = template.map do |candidate|
        begin
          process_line(candidate, line, meta) { |*_| }
          nil
        rescue ProcessingError => e
          e
        end
      end.compact
      raise ProcessingErrors.new("At least one of the following #{ template.length } template lines should match the given line", errors)
    end

    # Process all given lines against the template in order.
    #
    # Returns a Hash of symbol => converted data; fields whose data is an
    # empty string are omitted.
    def process_lines(line_matchers, lines, meta = {})
      record = {}
      line_matchers.zip(lines) do |matchers, line|
        process_line(matchers, line, meta) do |matcher, data, _raw|
          record[matcher[:symbol]] = data if data != ''
        end
      end
      record
    end

    # The formatter Hash (:format, :keys, :lengths) of every template line.
    def formatters(template)
      template.map { |matcher| matcher.first[:formatter] }
    end

    # Format data with the first template line for which every named field is
    # present in data and fits its column width.
    #
    # Raises RuntimeError when no template line qualifies.
    def format_any_line(template, data)
      chosen = formatters(template).detect do |candidate|
        candidate[:lengths].all? do |name, length|
          data[name] && data[name].to_s.length <= length
        end
      end
      raise 'No formatter found' unless chosen
      chosen[:format] % chosen[:keys].map { |key| data[key] }
    end

    # Format data with every template line and join the results with newlines.
    def format_template(template, data)
      formatters(template).map do |formatter|
        formatter[:format] % formatter[:keys].map { |key| data[key] }
      end.join("\n")
    end

    # Process a line against whichever template line matches it.
    #
    # Returns a Hash of symbol => converted data; fields whose data is an
    # empty string are omitted. meta is forwarded so that errors report the
    # file position.
    def process_any_line(template, line, meta = {})
      record = {}
      match_template(template, line, meta) do |matcher, data, _raw|
        record[matcher[:symbol]] = data if data != ''
      end
      record
    end

    # Process a given line against a given line matcher.
    #
    # With a block, yields (part_matcher, converted_data, raw_data) for every
    # named part and returns the line matcher. Without a block, returns a
    # Hash of symbol => converted data.
    #
    # Raises ProcessingError when the line is missing, too short, contains
    # different literal text, or holds a non-integer in an integer field.
    # Exceptions raised by the caller's block propagate unchanged.
    def process_line(matchers, line, meta = {})
      unless block_given?
        record = {}
        process_line(matchers, line, meta) { |m, value, _raw| record[m[:symbol]] = value }
        return record
      end

      # Happens when fewer lines than template lines are supplied.
      processing_error!('Missing line', matchers.first, line, 0, meta) if line.nil?

      pos = 0
      matchers.each do |matcher|
        line_part = line[pos, matcher[:length]]
        processing_error!('Unexpected EOL', matcher, line, pos, meta) unless line_part
        case matcher[:type]
        when :string
          if matcher[:string] != line_part
            processing_error!("Mismatch: #{ line_part.inspect } should be #{ matcher[:string].inspect }", matcher, line, pos, meta)
          end
          yield matcher, line_part, line_part if matcher[:symbol]
        when :data
          yield matcher, line_part.strip, line_part
        when :int
          digits = line_part.strip
          # Convert before yielding so that only conversion failures are
          # reported as processing errors.
          value = digits == '' ? nil : parse_integer(digits, matcher, line, pos, meta)
          yield matcher, value, line_part
        end
        pos += matcher[:length]
      end
    end

    private

    # Convert the stripped text of an integer field, ignoring leading zeros so
    # that zero-padded numbers are not read as octal.
    def parse_integer(digits, matcher, line, pos, meta)
      Integer(digits.sub(/^0*(\d)/, '\1'))
    rescue => e
      processing_error!(e.message, matcher, line, pos, meta)
    end

    # Compile one template line into a line matcher (Array of part Hashes).
    def compile_template_line(line)
      parts = line.split(/([#:]\w+\s*\]?|\?\s*\]?|<[#:]\w*>)/)
      next_symbol = nil
      next_type = nil
      matchers = parts.map do |part|
        len = part.length
        next nil if len == 0

        if part =~ /^<[#:]\w*>$/
          name = part[2..-2]
          if name.length > 0
            next_symbol = name.to_sym
            next_type = part[1, 1] == '#' ? :int : :data
          else
            # <:> used as a 0-width delimiter
            next_symbol = nil
          end
          next nil
        end

        marker = part[0, 1]
        # A trailing `]` closes a field and counts towards its width (len was
        # taken above). Literal text keeps its `]` so that :string and
        # :length stay in agreement.
        part = part[0..-2] if [':', '#', '?'].include?(marker) && part[-1, 1] == ']'
        matcher = case marker
                  when ':'
                    { :type => :data, :symbol => part.strip[1..-1].to_sym, :length => len, :template => line }
                  when '#'
                    { :type => :int, :symbol => part.strip[1..-1].to_sym, :length => len, :template => line }
                  when '?'
                    { :type => :ignore, :length => len, :template => line }
                  else
                    { :type => :string, :string => part, :length => len, :template => line }
                  end
        if next_symbol
          matcher[:symbol] = next_symbol
          matcher[:type] = next_type if matcher[:type] == :ignore
          next_symbol = nil
        end
        matcher
      end.compact

      # A blank template line yields no parts; give it an empty literal so
      # the regex and formatter have a first matcher to live on.
      matchers << { :type => :string, :string => '', :length => 0, :template => line } if matchers.empty?

      compile_regex(matchers)
      compile_formatter(matchers)
      matchers
    end

    # Build the regex that decides whether a line fits this template line and
    # store it on the first part matcher. Every part may alternatively be cut
    # short by the end of the line.
    def compile_regex(matchers)
      str = matchers.map do |matcher|
        case matcher[:type]
        when :string
          "(?:#{Regexp.escape matcher[:string]}|#{Regexp.escape(matcher[:string].rstrip)}$)"
        when :int
          "(?:[ 0-9]{#{matcher[:length]}}|[ 0-9]{,#{matcher[:length]}}$)"
        when :data, :ignore
          "(?:.{#{matcher[:length]}}|.{,#{matcher[:length]}}$)"
        end
      end.join('')
      matchers.first[:regex] = Regexp.new("\\A#{ str }", Regexp::MULTILINE)
    end

    # Build the format string used to write data back out in the layout of
    # this template line and store it on the first part matcher.
    #
    #   :format  - format string: text fields left-aligned, integer fields
    #              right-aligned, ignored regions as spaces, literals verbatim
    #   :keys    - one symbol per placeholder, in placeholder order
    #   :lengths - column width of every named part
    def compile_formatter(matchers)
      keys = []
      lengths = {}
      str = matchers.map do |matcher|
        lengths[matcher[:symbol]] = matcher[:length] if matcher[:symbol]
        case matcher[:type]
        when :data
          keys << matcher[:symbol]
          "%-#{matcher[:length]}s"
        when :int
          keys << matcher[:symbol]
          "%#{matcher[:length]}s"
        when :ignore
          " " * matcher[:length]
        when :string
          # Literal text takes no argument; escape % so it is emitted as-is.
          matcher[:string].gsub('%', '%%')
        end
      end.join('')
      matchers.first[:formatter] = { :format => str, :keys => keys, :lengths => lengths }
    end

    # Raise a ProcessingError whose message shows the template line, the
    # offending line, a marker under the failing columns and the file
    # position taken from meta (:file, :line_num and optionally :lines).
    def processing_error!(message, matcher, line, pos, meta)
      underline = '^' * (matcher[:length] > 0 ? matcher[:length] - 1 : 0)
      report = <<-MESSAGE
#{ message }:
#{ matcher[:template].inspect.gsub(/<[:#]\w*>/, '') }
#{ line.inspect }
#{ ' ' * pos }^#{ underline }
file: #{ meta[:file] } @ #{ meta[:line_num] }
      MESSAGE
      report += "\n\n#{ meta[:lines] }" if meta[:lines]
      raise ProcessingError.new(matcher, line, pos, meta, report)
    end
  end

  extend Parser

  # Convenience class for callers that prefer an instance over the
  # module-level methods.
  class Base
    include Parser
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for template_parser. The version is read from
# lib/template_parser/version.rb and the file lists are taken from git.
$LOAD_PATH.push(File.expand_path("../lib", __FILE__))
require "template_parser/version"

Gem::Specification.new do |spec|
  # Identity
  spec.name     = "template_parser"
  spec.version  = TemplateParser::VERSION
  spec.platform = Gem::Platform::RUBY

  # Author and project information
  spec.authors     = ["Darrick Wiebe"]
  spec.email       = ["darrick@innatesoftware.com"]
  spec.homepage    = "https://github.com/pangloss/template_parser"
  spec.summary     = "Parse text files by example"
  spec.description = "When you need to parse crazy oldschool ascii reports from mainframes or legacy applications of all sorts, this tool can make it quite easy and keep your code concise and maintainable."

  spec.rubyforge_project = "template_parser"

  # Packaged files: everything tracked by git
  tracked_files = `git ls-files`.split("\n")
  tracked_tests = `git ls-files -- {test,spec,features}/*`.split("\n")
  tracked_bins  = `git ls-files -- bin/*`.split("\n")

  spec.files         = tracked_files
  spec.test_files    = tracked_tests
  spec.executables   = tracked_bins.map { |path| File.basename(path) }
  spec.require_paths = ["lib"]
end
|
metadata
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: template_parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 1.0.0
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Darrick Wiebe
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-06-24 00:00:00 -04:00
|
14
|
+
default_executable:
|
15
|
+
dependencies: []
|
16
|
+
|
17
|
+
description: When you need to parse crazy oldschool ascii reports from mainframes or legacy applications of all sorts, this tool can make it quite easy and keep your code concise and maintainable.
|
18
|
+
email:
|
19
|
+
- darrick@innatesoftware.com
|
20
|
+
executables: []
|
21
|
+
|
22
|
+
extensions: []
|
23
|
+
|
24
|
+
extra_rdoc_files: []
|
25
|
+
|
26
|
+
files:
|
27
|
+
- .gitignore
|
28
|
+
- Gemfile
|
29
|
+
- README.md
|
30
|
+
- Rakefile
|
31
|
+
- lib/template_parser.rb
|
32
|
+
- lib/template_parser/version.rb
|
33
|
+
- template_parser.gemspec
|
34
|
+
has_rdoc: true
|
35
|
+
homepage: https://github.com/pangloss/template_parser
|
36
|
+
licenses: []
|
37
|
+
|
38
|
+
post_install_message:
|
39
|
+
rdoc_options: []
|
40
|
+
|
41
|
+
require_paths:
|
42
|
+
- lib
|
43
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
44
|
+
none: false
|
45
|
+
requirements:
|
46
|
+
- - ">="
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: "0"
|
49
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: "0"
|
55
|
+
requirements: []
|
56
|
+
|
57
|
+
rubyforge_project: template_parser
|
58
|
+
rubygems_version: 1.5.2
|
59
|
+
signing_key:
|
60
|
+
specification_version: 3
|
61
|
+
summary: Parse text files by example
|
62
|
+
test_files: []
|
63
|
+
|