regex 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/HISTORY +18 -1
- data/LICENSE +202 -21
- data/PACKAGE +7 -0
- data/PROFILE +20 -0
- data/README +76 -21
- data/bin/regex +1 -1
- data/lib/regex.rb +19 -228
- data/lib/regex/command.rb +16 -97
- data/lib/regex/extractor.rb +481 -0
- data/lib/regex/package.yml +7 -0
- data/lib/regex/replacer.rb +221 -0
- data/lib/regex/string.rb +1 -1
- data/lib/regex/templates.rb +85 -0
- data/{test/demos → qed}/regex.rdoc +5 -5
- data/qed/replacer.rdoc +57 -0
- metadata +54 -29
- data/MANIFEST +0 -25
- data/lib/regex/templates/common.rb +0 -13
- data/meta/authors +0 -2
- data/meta/created +0 -1
- data/meta/description +0 -1
- data/meta/download +0 -1
- data/meta/homepage +0 -1
- data/meta/mailinglist +0 -1
- data/meta/name +0 -1
- data/meta/repository +0 -1
- data/meta/summary +0 -1
- data/meta/title +0 -1
- data/meta/version +0 -1
data/lib/regex/command.rb
CHANGED
@@ -1,107 +1,26 @@
|
|
1
|
-
require 'regex'
|
1
|
+
require 'regex/extractor'
|
2
|
+
require 'regex/replacer'
|
2
3
|
|
3
|
-
|
4
|
+
module Regex
|
4
5
|
|
5
6
|
# Commandline interface.
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
#
|
13
|
-
attr :format
|
14
|
-
|
15
|
-
#
|
16
|
-
attr :options
|
17
|
-
|
18
|
-
#
|
19
|
-
def self.main(*argv)
|
20
|
-
new(*argv).main
|
21
|
-
end
|
22
|
-
|
23
|
-
# New Command.
|
24
|
-
def initialize(*argv)
|
25
|
-
@file = nil
|
26
|
-
@format = nil
|
27
|
-
@options = {}
|
28
|
-
parse(*argv)
|
7
|
+
def self.cli(*argv)
|
8
|
+
if argv.include?('-r') or argv.include?('--replace')
|
9
|
+
controller = Replacer
|
10
|
+
else
|
11
|
+
controller = Extractor
|
29
12
|
end
|
30
13
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
unless File.file?(@file)
|
40
|
-
puts "No such file -- '#{file}'."
|
41
|
-
exit 1
|
42
|
-
end
|
14
|
+
begin
|
15
|
+
controller.cli(argv)
|
16
|
+
rescue => error
|
17
|
+
if $DEBUG
|
18
|
+
raise error
|
19
|
+
#puts error.backtrace.join("\n ")
|
20
|
+
else
|
21
|
+
abort error.to_s
|
43
22
|
end
|
44
23
|
end
|
45
|
-
|
46
|
-
# OptionParser instance.
|
47
|
-
def parser
|
48
|
-
require 'optparse'
|
49
|
-
@options = {}
|
50
|
-
OptionParser.new do |opt|
|
51
|
-
opt.on('--template', '-t NAME', "select a built-in regular expression") do |name|
|
52
|
-
@options[:template] = name
|
53
|
-
end
|
54
|
-
|
55
|
-
opt.on('--index', '-n INT', "return a specific match index") do |int|
|
56
|
-
@options[:index] = int.to_i
|
57
|
-
end
|
58
|
-
|
59
|
-
opt.on('--insensitive', '-i', "case insensitive matching") do
|
60
|
-
@options[:insensitive] = true
|
61
|
-
end
|
62
|
-
|
63
|
-
opt.on('--unxml', '-x', "ignore XML/HTML tags") do
|
64
|
-
@options[:unxml] = true
|
65
|
-
end
|
66
|
-
|
67
|
-
opt.on('--repeat', '-r', "find all matching occurances") do
|
68
|
-
@options[:repeat] = true
|
69
|
-
end
|
70
|
-
|
71
|
-
opt.on('--yaml', '-y', "output in YAML format") do
|
72
|
-
@format = :yaml
|
73
|
-
end
|
74
|
-
|
75
|
-
opt.on('--json', '-j', "output in JSON format") do
|
76
|
-
@format = :json
|
77
|
-
end
|
78
|
-
|
79
|
-
opt.on_tail('--help', '-h', "display this lovely help message") do
|
80
|
-
puts opt
|
81
|
-
exit 0
|
82
|
-
end
|
83
|
-
end
|
84
|
-
end
|
85
|
-
|
86
|
-
#
|
87
|
-
def extraction
|
88
|
-
target = file ? File.new(file) : ARGF
|
89
|
-
Regex.new(target, options)
|
90
|
-
end
|
91
|
-
|
92
|
-
# Extract and display.
|
93
|
-
def main
|
94
|
-
begin
|
95
|
-
puts extraction.to_s(@format)
|
96
|
-
rescue => error
|
97
|
-
if $DEBUG
|
98
|
-
raise error
|
99
|
-
else
|
100
|
-
abort error.to_s
|
101
|
-
end
|
102
|
-
end
|
103
|
-
end
|
104
|
-
|
105
24
|
end
|
106
25
|
|
107
26
|
end
|
data/lib/regex/extractor.rb
CHANGED
@@ -1 +1,482 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
require 'open-uri'
|
3
|
+
require 'regex/string'
|
4
|
+
|
5
|
+
module Regex

  # Scans one or more inputs (IO objects or Strings) with a regular
  # expression and renders the matches as plain text, YAML or JSON.
  #
  # Supports [:name:] notation for substitution of built-in templates.
  class Extractor

    # When the regular expression returns multiple groups,
    # each is divided by the group deliminator.
    # This is the default value (ASCII 29 followed by a newline).
    DELIMINATOR_GROUP  = 29.chr + "\n"

    # When using repeat mode, each match is divided by
    # the record deliminator. This is the default value
    # (ASCII 30 followed by a newline).
    DELIMINATOR_RECORD = 30.chr + "\n"

    # Return the text of +input+, reading it at most once.
    #
    # A String is its own text; anything else is sent #read. Results are
    # kept in a class-level Hash keyed by the input object, so an IO that
    # can only be consumed once can still be consulted repeatedly (e.g.
    # for line number look-ups after scanning).
    def self.input_cache(input)
      @input_cache ||= {}
      @input_cache[input] ||= (
        case input
        when String
          input
        else
          input.read
        end
      )
    end

    # List of IO objects or Strings to search.
    attr_accessor :io

    # Remove XML tags from search. (NOT CURRENTLY SUPPORTED)
    attr_accessor :unxml

    # Regular expression.
    attr_accessor :pattern

    # Select built-in regular expression by name.
    attr_accessor :template

    # Index of expression return.
    attr_accessor :index

    # Multiline match.
    attr_accessor :multiline

    # Ignore case.
    attr_accessor :insensitive

    # Escape expression.
    attr_accessor :escape

    # Repeat Match.
    attr_accessor :repeat

    # Output format.
    attr_accessor :format

    # Provide detailed output.
    attr_accessor :detail

    # Use ANSI codes in output?
    attr_accessor :ansi

    # Use ANSI codes in output?
    def ansi? ; @ansi ; end

    # New extractor.
    #
    # io - any number of IO objects or Strings to search. If the last
    #      argument is a Hash it is taken as options, each key being
    #      assigned through the attribute writer of the same name.
    #
    # ANSI output is enabled unless the options say otherwise.
    def initialize(*io)
      options = Hash === io.last ? io.pop : {}

      @io   = io
      @ansi = true

      options.each do |k,v|
        __send__("#{k}=", v)
      end
    end

    # Only the class name is shown.
    def inspect
      "#{self.class.name}"
    end

    #--
    # TODO: unxml is too primitive, use real xml parser like nokogiri
    #++
    #def text
    #  @text ||= (
    #    if unxml
    #      raw.gsub!(/\<(.*?)\>/, '')
    #    else
    #      @raw
    #    end
    #  )
    #end

    # The Regexp to search with (memoized).
    #
    # A +template+ name takes precedence and is looked up as an upper-case
    # constant in Templates. Otherwise +pattern+ is used: a Regexp as is,
    # a String compiled with the multiline/insensitive flags, either
    # escaped literally (+escape+) or with [:name:] templates substituted.
    #
    # Raises ArgumentError if there is neither a template nor a usable
    # pattern.
    def regex
      @regex ||= (
        if template
          Templates.const_get(template.upcase)
        else
          case pattern
          when Regexp
            pattern
          when String
            flags = 0
            # The flags must be accumulated; a bare `flags + X` is discarded.
            flags |= Regexp::MULTILINE  if multiline
            flags |= Regexp::IGNORECASE if insensitive
            source = escape ? Regexp.escape(pattern) : substitute_templates(pattern)
            Regexp.new(source, flags)
          else
            raise ArgumentError, "no search pattern or template given"
          end
        end
      )
    end

    # Replace every [:name:] marker in +pattern+ with the corresponding
    # entry of Templates and return the resulting pattern string.
    def substitute_templates(pattern)
      pat = pattern
      Templates.list.each do |name|
        if pat.include?("[:#{name}:]")
          # Block form, so that backslashes in the template source are
          # inserted verbatim rather than interpreted as gsub escapes.
          # NOTE(review): `(?!:\\)` is a negative lookahead that can never
          # fail here; a look-behind for an escaping backslash was
          # presumably intended -- confirm before changing.
          pat = pat.gsub(/(?!:\\)\[\:#{name}\:\]/){ Templates[name].to_s }
        end
      end
      pat
    end

    # Render the matches.
    #
    # format - :yaml, :json, or anything else for plain text (detailed
    #          text if +detail+ is set).
    def to_s(format=nil)
      case format
      when :yaml
        to_s_yaml
      when :json
        to_s_json
      else
        if detail
          output_detailed_text
        else
          output_text
        end
      end
    end

    # Matches as YAML; grouped by input path when +detail+ is set.
    def to_s_yaml
      require 'yaml'
      if detail
        matches_by_path.to_yaml
      else
        structure.to_yaml
      end
    end

    # Matches as JSON; grouped by input path when +detail+ is set.
    # Falls back to json_pure if the json library cannot be loaded.
    def to_s_json
      begin
        require 'json'
      rescue LoadError
        require 'json_pure'
      end
      if detail
        matches_by_path.to_json
      else
        structure.to_json
      end
    end

    # Plain text output: groups joined by the group deliminator and, in
    # repeat mode, matches joined by the record deliminator. Returns an
    # empty string when a single match was requested but none was found.
    def output_text
      out = structure
      return '' if out.nil?
      if repeat
        out.map{ |groups| groups.join(deliminator_group) }.join(deliminator_record)
      else
        out.join(deliminator_group)
      end
    end

    # Detailed text output.
    #
    # Returns a String listing, per input, its path followed by one
    # formatted entry per match. Repeat mode ends with a match count;
    # single mode reports the first match only, or an empty string if
    # there is none.
    def output_detailed_text
      lines = []
      if repeat
        count = 0
        mapping.each do |input, matches|
          lines << ""
          lines << bold(input_path(input))
          matches.each do |match|
            lines << formatted_match(input, match)
            count += 1
          end
        end
        lines << "\n(#{count} matches)"
        lines.join("\n")
      else
        match = scan.first
        return '' unless match
        lines << ""
        lines << bold(input_path(match.input))
        lines << formatted_match(match.input, match)
        lines << ""
        # Trailing newline keeps the blank line that printing the parts
        # one by one would end with.
        lines.join("\n") + "\n"
      end
    end

    # Describe +match+ as "line char text" lines.
    #
    # With +index+ set only that group is described. Otherwise the whole
    # match comes first (in bold) followed by one numbered line per group.
    def formatted_match(input, match)
      string = []
      if index
        part, char, line = match.info(index)
        string << "%s %s %s" % [line, char, part.inspect]
      else
        part, char, line = match.info(0)
        string << bold("%s %s %s" % [line, char, part.inspect])
        (1...match.size).each do |i|
          part, char, line = match.info(i)
          string << "#{i}. %s %s %s" % [line, char, part.inspect]
        end
      end
      string.join("\n")
    end

    # Hash of input path => list of match breakdowns (see Match#breakdown).
    # When +index+ is set each entry is reduced to that group's breakdown.
    def matches_by_path
      r = Hash.new{ |h,k| h[k] = [] }
      mapping.each do |input, matches|
        path = input_path(input)
        matches.each do |match|
          r[path] << (index ? match.breakdown[index] : match.breakdown)
        end
      end
      r
    end

    # Structure the matchdata according to specified options.
    def structure
      repeat ? structure_repeat : structure_single
    end

    # Structure the matchdata for single match.
    # Returns nil if nothing matched.
    def structure_single
      structure_repeat.first
    end

    # Structure the matchdata for repeat matches.
    #
    # Each match becomes an Array: the group at +index+ if set, else all
    # capture groups, else (no groups) the whole match.
    def structure_repeat
      if index
        scan.map{ |match| [match[index]] }
      else
        scan.map{ |match| match.size > 1 ? match[1..-1] : [match[0]] }
      end
    end

    # Scan inputs for matches.
    #
    # Returns an Array of Match objects, in input order.
    def scan
      list = []
      io.each do |input|
        read(input).scan(regex) do
          list << Match.new(input, Regexp.last_match)
        end
      end
      list
    end

    # Hash of input => list of its Match objects.
    def mapping
      hash = Hash.new{ |h,k| h[k]=[] }
      scan.each do |match|
        hash[match.input] << match
      end
      hash
    end

    # Text of +input+, via the class-level input cache.
    #
    # TODO: unxml won't give correct char counts.
    def read(input)
      Extractor.input_cache(input)
      # if unxml
      #   txt.gsub(/\<(.*?)\>/, '')
      # else
      #   txt
      # end
    end

    # Return the line number of the +char+ position within +text+.
    #
    # Only newlines before +char+ are counted, so a position that is
    # itself a newline belongs to the line it terminates.
    def line_at(io, char)
      read(io)[0...char].count("\n") + 1
    end

    # Separator placed between the groups of one match.
    def deliminator_group
      DELIMINATOR_GROUP
    end

    # Separator placed between matches in repeat mode.
    def deliminator_record
      DELIMINATOR_RECORD
    end

    # Commandline Interface to Extractor.
    #
    # argv - argument list; options are removed from it in place.
    #
    # Unless --search or --template is given, the first remaining argument
    # selects what to search for: "/pattern/flags" (flags e, g, i) is a
    # pattern, anything else a template name. The other arguments are
    # files to search; without any, ARGF is read. Prints the result to
    # standard output. Exits with status 1 if a named file does not exist.
    def self.cli(argv=ARGV)
      require 'optparse'
      format  = nil
      options = {}
      parser = OptionParser.new do |opt|
        opt.on('--template', '-t NAME', "select a built-in regular expression") do |name|
          options[:template] = name
        end
        opt.on('--search', '-s PATTERN', "search for regular expression") do |re|
          options[:pattern] = re
        end
        opt.on('--index', '-n INT', "return a specific match index") do |int|
          options[:index] = int.to_i
        end
        opt.on('--insensitive', '-i', "case insensitive matching") do
          options[:insensitive] = true
        end
        opt.on('--multiline', '-m', "multiline matching") do
          options[:multiline] = true
        end
        #opt.on('--unxml', '-x', "ignore XML/HTML tags") do
        #  options[:unxml] = true
        #end
        opt.on('--global', '-g', "find all matching occurances") do
          options[:repeat] = true
        end
        opt.on('--yaml', '-y', "output in YAML format") do
          format = :yaml
        end
        opt.on('--json', '-j', "output in JSON format") do
          format = :json
        end
        opt.on('--detail', '-d', "provide match details") do
          options[:detail] = true
        end
        opt.on('--[no-]ansi', "toggle ansi color") do |val|
          options[:ansi] = val
        end
        opt.on_tail('--debug', 'run in debug mode') do
          $DEBUG = true
        end
        opt.on_tail('--help', '-h', "display this lovely help message") do
          puts opt
          exit 0
        end
      end
      parser.parse!(argv)

      unless options[:pattern] || options[:template]
        re = argv.shift
        case re
        when /^\/(.*?)\/(\w*?)$/
          source = Regexp.last_match(1)
          flags  = Regexp.last_match(2)
          options[:pattern] = source
          flags.split(//).each do |c|
            case c
            when 'e' then options[:escape] = true
            when 'g' then options[:repeat] = true
            when 'i' then options[:insensitive] = true
            end
          end
        else
          options[:template] = re
        end
      end

      files = argv

      files.each do |file|
        if !File.file?(file)
          $stderr.puts "No such file -- '#{file}'."
          exit 1
        end
      end

      # Every path was just verified to be a regular file, so open it as
      # a plain File (Kernel#open would also interpret "|command").
      # NOTE(review): ARGF reads the files named in ARGV; if +argv+ is a
      # copy of ARGV rather than ARGV itself, $stdin may be what is
      # wanted here -- confirm against the caller.
      inputs = files.empty? ? [ARGF] : files.map{ |f| File.open(f) }

      begin
        extract = new(*(inputs + [options]))
        puts extract.to_s(format)
      ensure
        inputs.each{ |input| input.close if File === input && !input.closed? }
      end
    end

    # Wrap +str+ in ANSI bold codes, or return it unchanged when ANSI
    # output is turned off.
    def bold(str)
      if ansi?
        "\e[1m" + str + "\e[0m"
      else
        str
      end
    end


    # A single match together with the input it was found in.
    class Match
      # Input (IO or String) the match was found in.
      attr_reader :input

      # The underlying MatchData.
      attr_reader :match

      # input - IO object or String that was searched
      # match - Instance of MatchData
      #
      def initialize(input, match)
        @input = input
        @match = match
      end

      # Matched text of group +i+ (0 is the whole match).
      def [](i)
        @match[i]
      end

      # Number of elements in the match (whole match plus groups).
      def size
        @match.size
      end

      # Array with one Hash per group (0 being the whole match), giving
      # its 'index', 'line', 'char' offset and 'text'.
      def breakdown
        (0...match.size).map do |i|
          char = match.offset(i)[0]
          {'index'=>i, 'line'=>line_at(char), 'char'=>char, 'text'=>match[i]}
        end
      end

      # Text, character offset and line number of group +index+.
      def info(index)
        text = match[index]
        char = match.offset(index)[0]
        line = line_at(char)
        return text, char, line
      end

      # Return the line number of the +char+ position within +text+,
      # or nil if +char+ is nil (a group that did not take part in the
      # match has no offset).
      #
      # Only newlines before +char+ are counted, so a match that starts
      # on a newline is reported on the line that newline terminates.
      def line_at(char)
        return nil unless char
        text[0...char].count("\n") + 1
      end

      # Full text of the input, via the class-level input cache.
      def text
        Extractor.input_cache(input)
      end

    end

    private

    # Label used for +input+ in detailed output: the path of a File,
    # otherwise a placeholder built from the object id.
    def input_path(input)
      File === input ? input.path : "(io #{input.object_id})"
    end

  end

end
|
1
482
|
|