text_extractor 0.1.7 → 0.1.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/text_extractor.rb +2 -26
- data/lib/text_extractor/directives.rb +3 -3
- data/lib/text_extractor/directives/classes.rb +5 -1
- data/lib/text_extractor/record.rb +45 -2
- data/lib/text_extractor/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 33c60b21e8e025e62025f06af668dd4859cde35c
|
4
|
+
data.tar.gz: 3c3b0c21fc91326431f95d323daedc84f580ed53
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b6e4d910641926e734e9c0cc22d486e15ac695c549ca1eb09c1e78621caa49e6e535533becb83dcd3f2ad93b81954cfc8efa74ade21f4b618a6f56ffe74ed978
|
7
|
+
data.tar.gz: 2ecc4bfd0e9e123e46254dbe264d5a102593a48a1668f9a0d41f7d66b22266925b58b21cf57c138121f4ece8559be3151ca33de6eb5905f965def79cbae096c6
|
data/lib/text_extractor.rb
CHANGED
@@ -65,34 +65,10 @@ class TextExtractor
|
|
65
65
|
value(id, re) { |val| IPAddr.new(val) }
|
66
66
|
end
|
67
67
|
|
68
|
-
def strip_record(regexp, strip: nil)
|
69
|
-
lines = regexp.source.split("\n")
|
70
|
-
prefix = lines.last
|
71
|
-
|
72
|
-
if prefix =~ /\A\s*\z/
|
73
|
-
lines.pop if lines.first =~ /\A\s*\z/
|
74
|
-
lines.shift
|
75
|
-
strip_record_by_line(lines, prefix, strip)
|
76
|
-
end
|
77
|
-
|
78
|
-
Regexp.new(lines.join("\n"), regexp.options)
|
79
|
-
end
|
80
|
-
|
81
|
-
def strip_record_by_line(lines, prefix, strip)
|
82
|
-
lines.map! { |s| s.gsub(prefix.to_s, '') }
|
83
|
-
case strip
|
84
|
-
when :left then lines.map! { |s| "\[ \t\r\f]*#{s.lstrip}" }
|
85
|
-
when :right then lines.map! { |s| "#{s.rstrip}\[ \t\r\f]*" }
|
86
|
-
when :both then lines.map! { |s| "\[ \t\r\f]*#{s.strip}\[ \t\r\f]*" }
|
87
|
-
end
|
88
|
-
end
|
89
|
-
|
90
68
|
def record(klass = Record, **kwargs, &block)
|
91
69
|
raise "#{self.class}.record requires a block" unless block
|
92
|
-
@current_record_values = []
|
93
|
-
|
94
|
-
kwargs[:values] = @current_record_values
|
95
|
-
@records << klass.new(regexp, **kwargs)
|
70
|
+
kwargs[:values] = @current_record_values = []
|
71
|
+
@records << klass.new(instance_exec(&block), **kwargs)
|
96
72
|
end
|
97
73
|
|
98
74
|
def filldown(**kwargs, &block)
|
@@ -39,7 +39,7 @@ class TextExtractor
|
|
39
39
|
private
|
40
40
|
|
41
41
|
DIRECTIVE_MAP = {
|
42
|
-
' ' => { class: Comment
|
42
|
+
' ' => { class: Comment },
|
43
43
|
'any' => { class: Any },
|
44
44
|
'begin' => { class: Begin, arguments: :parsed },
|
45
45
|
'end' => { class: End },
|
@@ -95,9 +95,9 @@ class TextExtractor
|
|
95
95
|
end
|
96
96
|
|
97
97
|
def parse_one_directive(source)
|
98
|
-
md = source.match(/^[a-z_]+/)
|
98
|
+
md = source.match(/^[a-z_]+/) || source.match(/^ /)
|
99
|
+
raise "Unknown directive(s) in #{@state.current_line}" unless md
|
99
100
|
word = md[0]
|
100
|
-
raise "Unknown directive(s) #{source}" unless md
|
101
101
|
map = DIRECTIVE_MAP.fetch(word) { raise "Unknown directive #{word}" }
|
102
102
|
args = parse_arguments(map[:arguments], md.post_match)
|
103
103
|
map.fetch(:class).new(@state, *args)
|
@@ -73,7 +73,11 @@ class TextExtractor
|
|
73
73
|
# skip to end of line
|
74
74
|
class Rest < Directive
|
75
75
|
def call
|
76
|
-
state.current =
|
76
|
+
state.current = if state.newline?
|
77
|
+
[state.current.chomp, '[^\\n]*\n']
|
78
|
+
else
|
79
|
+
[state.current, '[^\\n]*']
|
80
|
+
end
|
77
81
|
end
|
78
82
|
end
|
79
83
|
end
|
@@ -4,8 +4,9 @@ class TextExtractor
|
|
4
4
|
class Record
|
5
5
|
attr_reader :regexp, :factory, :values
|
6
6
|
|
7
|
-
def initialize(regexp, factory: nil, values: [], fill: [], directives: true
|
8
|
-
|
7
|
+
def initialize(regexp, factory: nil, values: [], fill: [], directives: true,
|
8
|
+
strip: nil)
|
9
|
+
@regexp = build_regexp(regexp, directives, strip)
|
9
10
|
@factory = factory
|
10
11
|
@constructor = FactoryAnalyzer.new(factory).to_proc
|
11
12
|
@values = values.map { |val| [val.id, val] }.to_h
|
@@ -25,6 +26,33 @@ class TextExtractor
|
|
25
26
|
@constructor.call(extracted)
|
26
27
|
end
|
27
28
|
|
29
|
+
def build_regexp(regexp, directives, strip)
|
30
|
+
stripped = strip_regexp(regexp, strip)
|
31
|
+
expanded = expand_regexp(stripped, directives)
|
32
|
+
ignore_regexp(expanded, strip)
|
33
|
+
end
|
34
|
+
|
35
|
+
def strip_regexp(regexp, strip)
|
36
|
+
lines = regexp.source.split("\n")
|
37
|
+
prefix = lines.last
|
38
|
+
if lines.first =~ /\A\s*\z/ && prefix =~ /\A\s*\z/
|
39
|
+
lines.shift
|
40
|
+
lines = lines.map { |s| s.gsub(prefix, '') }
|
41
|
+
lines = lines.map(&regexp_line_stripper(strip))
|
42
|
+
end
|
43
|
+
Regexp.new(lines.join("\n"), regexp.options)
|
44
|
+
end
|
45
|
+
|
46
|
+
def regexp_line_stripper(strip)
|
47
|
+
case strip
|
48
|
+
when :left then ->(s) { s.lstrip }
|
49
|
+
when :right then ->(s) { s.rstrip }
|
50
|
+
when :both then ->(s) { s.strip }
|
51
|
+
when nil, false then ->(s) { s }
|
52
|
+
else raise "Unknown strip option: #{strip}"
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
28
56
|
def expand_regexp(regexp, directives)
|
29
57
|
if directives
|
30
58
|
TextExtractor.expand_directives(regexp)
|
@@ -33,6 +61,21 @@ class TextExtractor
|
|
33
61
|
end
|
34
62
|
end
|
35
63
|
|
64
|
+
def ignore_regexp(regexp, strip)
|
65
|
+
return regexp unless strip
|
66
|
+
lines = regexp.source.split("\n").map(&regexp_line_ignorer(strip))
|
67
|
+
Regexp.new(lines.join("\n"), regexp.options)
|
68
|
+
end
|
69
|
+
|
70
|
+
def regexp_line_ignorer(strip)
|
71
|
+
case strip
|
72
|
+
when :left then ->(s) { "\[ \\t\\r\\f]*#{s}" }
|
73
|
+
when :right then ->(s) { "#{s}\[ \\t\\r\\f]*" }
|
74
|
+
when :both then ->(s) { "\[ \\t\\r\\f]*#{s}\[ \\t\\r\\f]*" }
|
75
|
+
else raise "Unknown ignore whitespace option: #{strip}"
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
36
79
|
def match(string, pos = 0)
|
37
80
|
@regexp.match(string, pos)
|
38
81
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.7
|
4
|
+
version: 0.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Miller
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|