text_extractor 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/text_extractor.rb +2 -26
- data/lib/text_extractor/directives.rb +3 -3
- data/lib/text_extractor/directives/classes.rb +5 -1
- data/lib/text_extractor/record.rb +45 -2
- data/lib/text_extractor/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 33c60b21e8e025e62025f06af668dd4859cde35c
|
4
|
+
data.tar.gz: 3c3b0c21fc91326431f95d323daedc84f580ed53
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b6e4d910641926e734e9c0cc22d486e15ac695c549ca1eb09c1e78621caa49e6e535533becb83dcd3f2ad93b81954cfc8efa74ade21f4b618a6f56ffe74ed978
|
7
|
+
data.tar.gz: 2ecc4bfd0e9e123e46254dbe264d5a102593a48a1668f9a0d41f7d66b22266925b58b21cf57c138121f4ece8559be3151ca33de6eb5905f965def79cbae096c6
|
data/lib/text_extractor.rb
CHANGED
@@ -65,34 +65,10 @@ class TextExtractor
|
|
65
65
|
value(id, re) { |val| IPAddr.new(val) }
|
66
66
|
end
|
67
67
|
|
68
|
-
def strip_record(regexp, strip: nil)
|
69
|
-
lines = regexp.source.split("\n")
|
70
|
-
prefix = lines.last
|
71
|
-
|
72
|
-
if prefix =~ /\A\s*\z/
|
73
|
-
lines.pop if lines.first =~ /\A\s*\z/
|
74
|
-
lines.shift
|
75
|
-
strip_record_by_line(lines, prefix, strip)
|
76
|
-
end
|
77
|
-
|
78
|
-
Regexp.new(lines.join("\n"), regexp.options)
|
79
|
-
end
|
80
|
-
|
81
|
-
def strip_record_by_line(lines, prefix, strip)
|
82
|
-
lines.map! { |s| s.gsub(prefix.to_s, '') }
|
83
|
-
case strip
|
84
|
-
when :left then lines.map! { |s| "\[ \t\r\f]*#{s.lstrip}" }
|
85
|
-
when :right then lines.map! { |s| "#{s.rstrip}\[ \t\r\f]*" }
|
86
|
-
when :both then lines.map! { |s| "\[ \t\r\f]*#{s.strip}\[ \t\r\f]*" }
|
87
|
-
end
|
88
|
-
end
|
89
|
-
|
90
68
|
def record(klass = Record, **kwargs, &block)
|
91
69
|
raise "#{self.class}.record requires a block" unless block
|
92
|
-
@current_record_values = []
|
93
|
-
|
94
|
-
kwargs[:values] = @current_record_values
|
95
|
-
@records << klass.new(regexp, **kwargs)
|
70
|
+
kwargs[:values] = @current_record_values = []
|
71
|
+
@records << klass.new(instance_exec(&block), **kwargs)
|
96
72
|
end
|
97
73
|
|
98
74
|
def filldown(**kwargs, &block)
|
@@ -39,7 +39,7 @@ class TextExtractor
|
|
39
39
|
private
|
40
40
|
|
41
41
|
DIRECTIVE_MAP = {
|
42
|
-
' ' => { class: Comment
|
42
|
+
' ' => { class: Comment },
|
43
43
|
'any' => { class: Any },
|
44
44
|
'begin' => { class: Begin, arguments: :parsed },
|
45
45
|
'end' => { class: End },
|
@@ -95,9 +95,9 @@ class TextExtractor
|
|
95
95
|
end
|
96
96
|
|
97
97
|
def parse_one_directive(source)
|
98
|
-
md = source.match(/^[a-z_]+/)
|
98
|
+
md = source.match(/^[a-z_]+/) || source.match(/^ /)
|
99
|
+
raise "Unknown directive(s) in #{@state.current_line}" unless md
|
99
100
|
word = md[0]
|
100
|
-
raise "Unknown directive(s) #{source}" unless md
|
101
101
|
map = DIRECTIVE_MAP.fetch(word) { raise "Unknown directive #{word}" }
|
102
102
|
args = parse_arguments(map[:arguments], md.post_match)
|
103
103
|
map.fetch(:class).new(@state, *args)
|
@@ -73,7 +73,11 @@ class TextExtractor
|
|
73
73
|
# skip to end of line
|
74
74
|
class Rest < Directive
|
75
75
|
def call
|
76
|
-
state.current =
|
76
|
+
state.current = if state.newline?
|
77
|
+
[state.current.chomp, '[^\\n]*\n']
|
78
|
+
else
|
79
|
+
[state.current, '[^\\n]*']
|
80
|
+
end
|
77
81
|
end
|
78
82
|
end
|
79
83
|
end
|
@@ -4,8 +4,9 @@ class TextExtractor
|
|
4
4
|
class Record
|
5
5
|
attr_reader :regexp, :factory, :values
|
6
6
|
|
7
|
-
def initialize(regexp, factory: nil, values: [], fill: [], directives: true
|
8
|
-
|
7
|
+
def initialize(regexp, factory: nil, values: [], fill: [], directives: true,
|
8
|
+
strip: nil)
|
9
|
+
@regexp = build_regexp(regexp, directives, strip)
|
9
10
|
@factory = factory
|
10
11
|
@constructor = FactoryAnalyzer.new(factory).to_proc
|
11
12
|
@values = values.map { |val| [val.id, val] }.to_h
|
@@ -25,6 +26,33 @@ class TextExtractor
|
|
25
26
|
@constructor.call(extracted)
|
26
27
|
end
|
27
28
|
|
29
|
+
def build_regexp(regexp, directives, strip)
|
30
|
+
stripped = strip_regexp(regexp, strip)
|
31
|
+
expanded = expand_regexp(stripped, directives)
|
32
|
+
ignore_regexp(expanded, strip)
|
33
|
+
end
|
34
|
+
|
35
|
+
def strip_regexp(regexp, strip)
|
36
|
+
lines = regexp.source.split("\n")
|
37
|
+
prefix = lines.last
|
38
|
+
if lines.first =~ /\A\s*\z/ && prefix =~ /\A\s*\z/
|
39
|
+
lines.shift
|
40
|
+
lines = lines.map { |s| s.gsub(prefix, '') }
|
41
|
+
lines = lines.map(&regexp_line_stripper(strip))
|
42
|
+
end
|
43
|
+
Regexp.new(lines.join("\n"), regexp.options)
|
44
|
+
end
|
45
|
+
|
46
|
+
def regexp_line_stripper(strip)
|
47
|
+
case strip
|
48
|
+
when :left then ->(s) { s.lstrip }
|
49
|
+
when :right then ->(s) { s.rstrip }
|
50
|
+
when :both then ->(s) { s.strip }
|
51
|
+
when nil, false then ->(s) { s }
|
52
|
+
else raise "Unknown strip option: #{strip}"
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
28
56
|
def expand_regexp(regexp, directives)
|
29
57
|
if directives
|
30
58
|
TextExtractor.expand_directives(regexp)
|
@@ -33,6 +61,21 @@ class TextExtractor
|
|
33
61
|
end
|
34
62
|
end
|
35
63
|
|
64
|
+
def ignore_regexp(regexp, strip)
|
65
|
+
return regexp unless strip
|
66
|
+
lines = regexp.source.split("\n").map(&regexp_line_ignorer(strip))
|
67
|
+
Regexp.new(lines.join("\n"), regexp.options)
|
68
|
+
end
|
69
|
+
|
70
|
+
def regexp_line_ignorer(strip)
|
71
|
+
case strip
|
72
|
+
when :left then ->(s) { "\[ \\t\\r\\f]*#{s}" }
|
73
|
+
when :right then ->(s) { "#{s}\[ \\t\\r\\f]*" }
|
74
|
+
when :both then ->(s) { "\[ \\t\\r\\f]*#{s}\[ \\t\\r\\f]*" }
|
75
|
+
else raise "Unknown ignore whitespace option: #{strip}"
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
36
79
|
def match(string, pos = 0)
|
37
80
|
@regexp.match(string, pos)
|
38
81
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.7
|
4
|
+
version: 0.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Miller
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|