text_extractor 0.5.2 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/text_extractor.rb +55 -15
- data/lib/text_extractor/directives.rb +11 -6
- data/lib/text_extractor/directives/group.rb +1 -0
- data/lib/text_extractor/extraction.rb +1 -0
- data/lib/text_extractor/record.rb +20 -43
- data/lib/text_extractor/version.rb +1 -1
- metadata +9 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bc66aa843889a7f5396d26c41d7756fb56b2157563d7aa8640732867d32750c4
|
4
|
+
data.tar.gz: d491b948b0baece51042436d5d9d934d6800e26255bbf4a73f8f815b1aedda44
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4111f9a6090fb6314fea8e164b9bd6fedb669bdfd8abe8031cbfbedc3c0e52df83f7ff2c0377bf00aa53340ee8423c7ab7d203d770114d1a756d09b4d0869b29
|
7
|
+
data.tar.gz: 48fa9d25532211f7473cd9535ad3740fe34cc79fc0f0628df35399c7711a303d16f5cc0182367fdfb4968bd1e47e6a608fa5e36d239ae9738d92afc8a232d221
|
data/lib/text_extractor.rb
CHANGED
@@ -8,37 +8,53 @@ require_relative 'text_extractor/inline_value'
|
|
8
8
|
|
9
9
|
# represents an extractor definition
|
10
10
|
class TextExtractor
|
11
|
+
@append_newline = false
|
12
|
+
|
13
|
+
singleton_class.instance_eval do
|
14
|
+
attr_accessor :append_newline
|
15
|
+
end
|
16
|
+
|
11
17
|
attr_reader :records, :values
|
12
18
|
|
13
|
-
# rubocop: disable Metrics/MethodLength
|
14
19
|
def initialize(&block)
|
15
20
|
raise "#{self.class}.new requires a block" unless block
|
21
|
+
|
22
|
+
initialize_options
|
23
|
+
initialize_collections
|
24
|
+
instance_exec(&block)
|
25
|
+
@append_guards.each { |g| guard(**g, &g[:block]) }
|
26
|
+
end
|
27
|
+
|
28
|
+
def initialize_options
|
29
|
+
@factory = nil
|
30
|
+
@section_delimiter = nil
|
31
|
+
@section_terminator = nil
|
32
|
+
@strip = nil
|
33
|
+
@append_newline = nil
|
34
|
+
end
|
35
|
+
|
36
|
+
def initialize_collections
|
16
37
|
@values = {}
|
17
38
|
@fill = {}
|
18
39
|
@values = {}
|
19
40
|
@records = []
|
20
41
|
@filldowns = []
|
21
42
|
@current_record_values = []
|
22
|
-
@section_delimiter = nil
|
23
|
-
@section_terminator = nil
|
24
43
|
@append_guards = []
|
25
|
-
instance_exec(&block)
|
26
|
-
@append_guards.each { |g| guard(**g, &g[:block]) }
|
27
44
|
end
|
28
|
-
# rubocop: enable Metrics/MethodLength
|
29
45
|
|
30
46
|
module Patterns
|
31
|
-
INTEGER = /\d+/
|
32
|
-
FLOAT = /\d+\.?|\d*\.\d+/
|
33
|
-
RATIONAL = %r{\d+/\d+}
|
34
|
-
IPV4 = /[0-9.]{7,15}/
|
35
|
-
IPV6 = /[:a-fA-F0-9\.]{2,45}/
|
47
|
+
INTEGER = /\d+/.freeze
|
48
|
+
FLOAT = /\d+\.?|\d*\.\d+/.freeze
|
49
|
+
RATIONAL = %r{\d+/\d+}.freeze
|
50
|
+
IPV4 = /[0-9.]{7,15}/.freeze
|
51
|
+
IPV6 = /[:a-fA-F0-9\.]{2,45}/.freeze
|
36
52
|
IPADDR = Regexp.union(IPV4, IPV6)
|
37
|
-
IPV4_NET = %r{#{IPV4}/\d{1,2}}
|
38
|
-
IPV6_NET = %r{#{IPV6}\/\d{1,3}}
|
53
|
+
IPV4_NET = %r{#{IPV4}/\d{1,2}}.freeze
|
54
|
+
IPV6_NET = %r{#{IPV6}\/\d{1,3}}.freeze
|
39
55
|
IPNETADDR = Regexp.union(IPV4_NET, IPV6_NET)
|
40
|
-
TRUE = /y|yes|t|true|on/i
|
41
|
-
FALSE = /n|no|f|false|off/i
|
56
|
+
TRUE = /y|yes|t|true|on/i.freeze
|
57
|
+
FALSE = /n|no|f|false|off/i.freeze
|
42
58
|
BOOLEAN = Regexp.union(TRUE, FALSE)
|
43
59
|
end
|
44
60
|
|
@@ -78,8 +94,16 @@ class TextExtractor
|
|
78
94
|
value(id, re) { |val| IPAddr.new(val) }
|
79
95
|
end
|
80
96
|
|
97
|
+
def append_newline(activate = nil)
|
98
|
+
return TextExtractor.append_newline if activate.nil? && @append_newline.nil?
|
99
|
+
return @append_newline if activate.nil?
|
100
|
+
|
101
|
+
@append_newline = activate
|
102
|
+
end
|
103
|
+
|
81
104
|
def record(klass = Record, **kwargs, &block)
|
82
105
|
raise "#{self.class}.record requires a block" unless block
|
106
|
+
|
83
107
|
kwargs[:extractor_values] = values
|
84
108
|
kwargs[:factory] ||= @factory if @factory
|
85
109
|
kwargs[:values] = @current_record_values = []
|
@@ -91,6 +115,17 @@ class TextExtractor
|
|
91
115
|
@section_terminator = terminator
|
92
116
|
end
|
93
117
|
|
118
|
+
STRIP_PROCS = {
|
119
|
+
left: ->(s) { s.split("\n").map(&:lstrip).join("\n") + "\n" },
|
120
|
+
right: ->(s) { s.split("\n").map(&:rstrip).join("\n") + "\n" },
|
121
|
+
both: ->(s) { s.split("\n").map(&:strip).join("\n") + "\n" }
|
122
|
+
}.freeze
|
123
|
+
|
124
|
+
def strip(side = nil)
|
125
|
+
@strip = STRIP_PROCS[side] ||
|
126
|
+
(raise ArgumentError, 'Unknown strip option')
|
127
|
+
end
|
128
|
+
|
94
129
|
def factory(object = nil)
|
95
130
|
if object
|
96
131
|
@factory = object
|
@@ -101,6 +136,7 @@ class TextExtractor
|
|
101
136
|
|
102
137
|
def filldown(**kwargs, &block)
|
103
138
|
raise "#{self.class}.filldown requires a block" unless block
|
139
|
+
|
104
140
|
record(Filldown, **kwargs, &block)
|
105
141
|
end
|
106
142
|
|
@@ -110,6 +146,7 @@ class TextExtractor
|
|
110
146
|
|
111
147
|
def guard(**kwargs, &block)
|
112
148
|
raise "#{self.class}.guard requires a block" unless block
|
149
|
+
|
113
150
|
record(Guard, **kwargs, &block)
|
114
151
|
end
|
115
152
|
|
@@ -119,6 +156,8 @@ class TextExtractor
|
|
119
156
|
end
|
120
157
|
|
121
158
|
def scan(input)
|
159
|
+
input = @strip.call(input) if @strip
|
160
|
+
input += "\n" if append_newline && !input.end_with?("\n")
|
122
161
|
prefill = {}
|
123
162
|
sections(input).flat_map { |section|
|
124
163
|
Extraction.new(section, self, prefill).scan.extraction_matches
|
@@ -136,6 +175,7 @@ class TextExtractor
|
|
136
175
|
|
137
176
|
def skip(**kwargs, &block)
|
138
177
|
raise "#{self.class}.skip requires a block" unless block
|
178
|
+
|
139
179
|
record(Skip, **kwargs, &block)
|
140
180
|
end
|
141
181
|
|
@@ -31,10 +31,12 @@ class TextExtractor
|
|
31
31
|
|
32
32
|
def expand
|
33
33
|
return @output if @output
|
34
|
+
|
34
35
|
@state = State.new
|
35
36
|
scanner = StringScanner.new(@source)
|
36
37
|
read_line(scanner) until scanner.eos?
|
37
38
|
raise 'Unterminated line group' unless @state.groups.empty?
|
39
|
+
|
38
40
|
@output = Regexp.new(@state.target.join(''), @options)
|
39
41
|
end
|
40
42
|
|
@@ -45,14 +47,14 @@ class TextExtractor
|
|
45
47
|
private
|
46
48
|
|
47
49
|
DIRECTIVE_MAP = {
|
48
|
-
' '
|
49
|
-
'any'
|
50
|
-
'begin'
|
50
|
+
' ' => { class: Comment },
|
51
|
+
'any' => { class: Any },
|
52
|
+
'begin' => { class: Begin, arguments: :parsed },
|
51
53
|
'capture' => { class: Capture, arguments: :parsed },
|
52
|
-
'end'
|
53
|
-
'maybe'
|
54
|
+
'end' => { class: End },
|
55
|
+
'maybe' => { class: Maybe },
|
54
56
|
'repeat' => { class: Repeat, arguments: :parse },
|
55
|
-
'rest'
|
57
|
+
'rest' => { class: Rest }
|
56
58
|
}.freeze
|
57
59
|
private_constant :DIRECTIVE_MAP
|
58
60
|
|
@@ -97,6 +99,7 @@ class TextExtractor
|
|
97
99
|
|
98
100
|
def parse_directives(full_source)
|
99
101
|
return [Comment.new(@state)] if full_source.start_with?(' ')
|
102
|
+
|
100
103
|
split_directives(full_source)
|
101
104
|
.map { |source| parse_one_directive(source) }
|
102
105
|
.each { |directive| @directives << directive }
|
@@ -105,6 +108,7 @@ class TextExtractor
|
|
105
108
|
def parse_one_directive(source)
|
106
109
|
md = source.match(/^[a-z_]+/) || source.match(/^ /)
|
107
110
|
raise "Unknown directive(s) in #{@state.current_line}" unless md
|
111
|
+
|
108
112
|
word = md[0]
|
109
113
|
map = DIRECTIVE_MAP.fetch(word) { raise "Unknown directive #{word}" }
|
110
114
|
args = parse_arguments(map[:arguments], md.post_match)
|
@@ -118,6 +122,7 @@ class TextExtractor
|
|
118
122
|
def parse_arguments(rule, source)
|
119
123
|
return [] unless rule
|
120
124
|
return rule.call(source) if rule.is_a?(Proc)
|
125
|
+
|
121
126
|
source.match(/\(([^)]*)\)/) { |md| md[1] }
|
122
127
|
end
|
123
128
|
end # class Expander
|
@@ -4,17 +4,15 @@ class TextExtractor
|
|
4
4
|
class Record
|
5
5
|
attr_reader :regexp, :factory, :values
|
6
6
|
|
7
|
-
# rubocop: disable Metrics/ParameterLists
|
8
7
|
def initialize(
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
**_kwargs
|
8
|
+
regexp,
|
9
|
+
factory: nil,
|
10
|
+
values: [],
|
11
|
+
fill: [],
|
12
|
+
directives: true,
|
13
|
+
inline: [],
|
14
|
+
extractor_values: {},
|
15
|
+
**_kwargs
|
18
16
|
)
|
19
17
|
@factory = factory
|
20
18
|
@constructor = FactoryAnalyzer.new(factory).to_proc
|
@@ -22,10 +20,9 @@ class TextExtractor
|
|
22
20
|
@values = values.map { |val| [val.id, val] }.to_h
|
23
21
|
initialize_inline_values(inline)
|
24
22
|
@default_values = values.map { |val| [val.id, nil] }.to_h
|
25
|
-
@regexp = build_regexp(regexp, directives
|
23
|
+
@regexp = build_regexp(regexp, directives)
|
26
24
|
@fill = Array(fill)
|
27
25
|
end
|
28
|
-
# rubocop: enable Metrics/ParameterLists
|
29
26
|
|
30
27
|
# @return Array
|
31
28
|
def extraction(match, fill)
|
@@ -37,36 +34,29 @@ class TextExtractor
|
|
37
34
|
|
38
35
|
def build_extraction(extracted)
|
39
36
|
return extracted unless @constructor
|
37
|
+
|
40
38
|
@constructor.call(extracted)
|
41
39
|
end
|
42
40
|
|
43
|
-
def build_regexp(regexp, directives
|
44
|
-
stripped = strip_regexp(regexp
|
45
|
-
|
46
|
-
|
41
|
+
def build_regexp(regexp, directives)
|
42
|
+
stripped = strip_regexp(regexp)
|
43
|
+
final = expand_regexp(stripped, directives)
|
44
|
+
|
45
|
+
raise EmptyRecordError, 'Empty record detected' if final =~ ''
|
46
|
+
|
47
|
+
final
|
47
48
|
end
|
48
49
|
|
49
|
-
def strip_regexp(regexp
|
50
|
+
def strip_regexp(regexp)
|
50
51
|
lines = regexp.source.split("\n")
|
51
52
|
prefix = lines.last
|
52
53
|
if lines.first =~ /\A\s*\z/ && prefix =~ /\A\s*\z/
|
53
54
|
lines.shift
|
54
55
|
lines = lines.map { |s| s.gsub(prefix, '') }
|
55
|
-
lines = lines.map(&regexp_line_stripper(strip))
|
56
56
|
end
|
57
57
|
Regexp.new(lines.join("\n"), regexp.options)
|
58
58
|
end
|
59
59
|
|
60
|
-
def regexp_line_stripper(strip)
|
61
|
-
case strip
|
62
|
-
when :left then ->(s) { s.lstrip }
|
63
|
-
when :right then ->(s) { s.rstrip }
|
64
|
-
when :both then ->(s) { s.strip }
|
65
|
-
when nil, false then ->(s) { s }
|
66
|
-
else raise "Unknown strip option: #{strip}"
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
60
|
def expand_regexp(regexp, directives)
|
71
61
|
if directives
|
72
62
|
expander = Directives.new(regexp)
|
@@ -80,21 +70,6 @@ class TextExtractor
|
|
80
70
|
end
|
81
71
|
end
|
82
72
|
|
83
|
-
def ignore_regexp(regexp, strip)
|
84
|
-
return regexp unless strip
|
85
|
-
lines = regexp.source.split("\n").map(&regexp_line_ignorer(strip))
|
86
|
-
Regexp.new(lines.join("\n"), regexp.options)
|
87
|
-
end
|
88
|
-
|
89
|
-
def regexp_line_ignorer(strip)
|
90
|
-
case strip
|
91
|
-
when :left then ->(s) { "\[ \\t\\r\\f]*#{s}" }
|
92
|
-
when :right then ->(s) { "#{s}\[ \\t\\r\\f]*" }
|
93
|
-
when :both then ->(s) { "\[ \\t\\r\\f]*#{s}\[ \\t\\r\\f]*" }
|
94
|
-
else raise "Unknown ignore whitespace option: #{strip}"
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
73
|
def match(string, pos = 0)
|
99
74
|
@regexp.match(string, pos)
|
100
75
|
end
|
@@ -178,4 +153,6 @@ class TextExtractor
|
|
178
153
|
end
|
179
154
|
end # class FactoryAnalyzer
|
180
155
|
end # class Record
|
156
|
+
|
157
|
+
class EmptyRecordError < StandardError; end
|
181
158
|
end # class TextExtractor
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.2
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Miller
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-04-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '2.0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '2.0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: minitest
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -44,28 +44,28 @@ dependencies:
|
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
47
|
+
version: '13.0'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
54
|
+
version: '13.0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: rubocop
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '0.
|
61
|
+
version: '0.82'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '0.
|
68
|
+
version: '0.82'
|
69
69
|
description:
|
70
70
|
email: bjmllr@gmail.com
|
71
71
|
executables: []
|
@@ -104,7 +104,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
104
104
|
version: '0'
|
105
105
|
requirements: []
|
106
106
|
rubyforge_project:
|
107
|
-
rubygems_version: 2.7.
|
107
|
+
rubygems_version: 2.7.6.2
|
108
108
|
signing_key:
|
109
109
|
specification_version: 4
|
110
110
|
summary: Easily extract data from text
|