text_extractor 0.5.2 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/text_extractor.rb +55 -15
- data/lib/text_extractor/directives.rb +11 -6
- data/lib/text_extractor/directives/group.rb +1 -0
- data/lib/text_extractor/extraction.rb +1 -0
- data/lib/text_extractor/record.rb +20 -43
- data/lib/text_extractor/version.rb +1 -1
- metadata +9 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bc66aa843889a7f5396d26c41d7756fb56b2157563d7aa8640732867d32750c4
|
4
|
+
data.tar.gz: d491b948b0baece51042436d5d9d934d6800e26255bbf4a73f8f815b1aedda44
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4111f9a6090fb6314fea8e164b9bd6fedb669bdfd8abe8031cbfbedc3c0e52df83f7ff2c0377bf00aa53340ee8423c7ab7d203d770114d1a756d09b4d0869b29
|
7
|
+
data.tar.gz: 48fa9d25532211f7473cd9535ad3740fe34cc79fc0f0628df35399c7711a303d16f5cc0182367fdfb4968bd1e47e6a608fa5e36d239ae9738d92afc8a232d221
|
data/lib/text_extractor.rb
CHANGED
@@ -8,37 +8,53 @@ require_relative 'text_extractor/inline_value'
|
|
8
8
|
|
9
9
|
# represents an extractor definition
|
10
10
|
class TextExtractor
|
11
|
+
@append_newline = false
|
12
|
+
|
13
|
+
singleton_class.instance_eval do
|
14
|
+
attr_accessor :append_newline
|
15
|
+
end
|
16
|
+
|
11
17
|
attr_reader :records, :values
|
12
18
|
|
13
|
-
# rubocop: disable Metrics/MethodLength
|
14
19
|
def initialize(&block)
|
15
20
|
raise "#{self.class}.new requires a block" unless block
|
21
|
+
|
22
|
+
initialize_options
|
23
|
+
initialize_collections
|
24
|
+
instance_exec(&block)
|
25
|
+
@append_guards.each { |g| guard(**g, &g[:block]) }
|
26
|
+
end
|
27
|
+
|
28
|
+
def initialize_options
|
29
|
+
@factory = nil
|
30
|
+
@section_delimiter = nil
|
31
|
+
@section_terminator = nil
|
32
|
+
@strip = nil
|
33
|
+
@append_newline = nil
|
34
|
+
end
|
35
|
+
|
36
|
+
def initialize_collections
|
16
37
|
@values = {}
|
17
38
|
@fill = {}
|
18
39
|
@values = {}
|
19
40
|
@records = []
|
20
41
|
@filldowns = []
|
21
42
|
@current_record_values = []
|
22
|
-
@section_delimiter = nil
|
23
|
-
@section_terminator = nil
|
24
43
|
@append_guards = []
|
25
|
-
instance_exec(&block)
|
26
|
-
@append_guards.each { |g| guard(**g, &g[:block]) }
|
27
44
|
end
|
28
|
-
# rubocop: enable Metrics/MethodLength
|
29
45
|
|
30
46
|
module Patterns
|
31
|
-
INTEGER = /\d
|
32
|
-
FLOAT = /\d+\.?|\d*\.\d
|
33
|
-
RATIONAL = %r{\d+/\d+}
|
34
|
-
IPV4 = /[0-9.]{7,15}
|
35
|
-
IPV6 = /[:a-fA-F0-9\.]{2,45}
|
47
|
+
INTEGER = /\d+/.freeze
|
48
|
+
FLOAT = /\d+\.?|\d*\.\d+/.freeze
|
49
|
+
RATIONAL = %r{\d+/\d+}.freeze
|
50
|
+
IPV4 = /[0-9.]{7,15}/.freeze
|
51
|
+
IPV6 = /[:a-fA-F0-9\.]{2,45}/.freeze
|
36
52
|
IPADDR = Regexp.union(IPV4, IPV6)
|
37
|
-
IPV4_NET = %r{#{IPV4}/\d{1,2}}
|
38
|
-
IPV6_NET = %r{#{IPV6}\/\d{1,3}}
|
53
|
+
IPV4_NET = %r{#{IPV4}/\d{1,2}}.freeze
|
54
|
+
IPV6_NET = %r{#{IPV6}\/\d{1,3}}.freeze
|
39
55
|
IPNETADDR = Regexp.union(IPV4_NET, IPV6_NET)
|
40
|
-
TRUE = /y|yes|t|true|on/i
|
41
|
-
FALSE = /n|no|f|false|off/i
|
56
|
+
TRUE = /y|yes|t|true|on/i.freeze
|
57
|
+
FALSE = /n|no|f|false|off/i.freeze
|
42
58
|
BOOLEAN = Regexp.union(TRUE, FALSE)
|
43
59
|
end
|
44
60
|
|
@@ -78,8 +94,16 @@ class TextExtractor
|
|
78
94
|
value(id, re) { |val| IPAddr.new(val) }
|
79
95
|
end
|
80
96
|
|
97
|
+
def append_newline(activate = nil)
|
98
|
+
return TextExtractor.append_newline if activate.nil? && @append_newline.nil?
|
99
|
+
return @append_newline if activate.nil?
|
100
|
+
|
101
|
+
@append_newline = activate
|
102
|
+
end
|
103
|
+
|
81
104
|
def record(klass = Record, **kwargs, &block)
|
82
105
|
raise "#{self.class}.record requires a block" unless block
|
106
|
+
|
83
107
|
kwargs[:extractor_values] = values
|
84
108
|
kwargs[:factory] ||= @factory if @factory
|
85
109
|
kwargs[:values] = @current_record_values = []
|
@@ -91,6 +115,17 @@ class TextExtractor
|
|
91
115
|
@section_terminator = terminator
|
92
116
|
end
|
93
117
|
|
118
|
+
STRIP_PROCS = {
|
119
|
+
left: ->(s) { s.split("\n").map(&:lstrip).join("\n") + "\n" },
|
120
|
+
right: ->(s) { s.split("\n").map(&:rstrip).join("\n") + "\n" },
|
121
|
+
both: ->(s) { s.split("\n").map(&:strip).join("\n") + "\n" }
|
122
|
+
}.freeze
|
123
|
+
|
124
|
+
def strip(side = nil)
|
125
|
+
@strip = STRIP_PROCS[side] ||
|
126
|
+
(raise ArgumentError, 'Unknown strip option')
|
127
|
+
end
|
128
|
+
|
94
129
|
def factory(object = nil)
|
95
130
|
if object
|
96
131
|
@factory = object
|
@@ -101,6 +136,7 @@ class TextExtractor
|
|
101
136
|
|
102
137
|
def filldown(**kwargs, &block)
|
103
138
|
raise "#{self.class}.filldown requires a block" unless block
|
139
|
+
|
104
140
|
record(Filldown, **kwargs, &block)
|
105
141
|
end
|
106
142
|
|
@@ -110,6 +146,7 @@ class TextExtractor
|
|
110
146
|
|
111
147
|
def guard(**kwargs, &block)
|
112
148
|
raise "#{self.class}.guard requires a block" unless block
|
149
|
+
|
113
150
|
record(Guard, **kwargs, &block)
|
114
151
|
end
|
115
152
|
|
@@ -119,6 +156,8 @@ class TextExtractor
|
|
119
156
|
end
|
120
157
|
|
121
158
|
def scan(input)
|
159
|
+
input = @strip.call(input) if @strip
|
160
|
+
input += "\n" if append_newline && !input.end_with?("\n")
|
122
161
|
prefill = {}
|
123
162
|
sections(input).flat_map { |section|
|
124
163
|
Extraction.new(section, self, prefill).scan.extraction_matches
|
@@ -136,6 +175,7 @@ class TextExtractor
|
|
136
175
|
|
137
176
|
def skip(**kwargs, &block)
|
138
177
|
raise "#{self.class}.skip requires a block" unless block
|
178
|
+
|
139
179
|
record(Skip, **kwargs, &block)
|
140
180
|
end
|
141
181
|
|
@@ -31,10 +31,12 @@ class TextExtractor
|
|
31
31
|
|
32
32
|
def expand
|
33
33
|
return @output if @output
|
34
|
+
|
34
35
|
@state = State.new
|
35
36
|
scanner = StringScanner.new(@source)
|
36
37
|
read_line(scanner) until scanner.eos?
|
37
38
|
raise 'Unterminated line group' unless @state.groups.empty?
|
39
|
+
|
38
40
|
@output = Regexp.new(@state.target.join(''), @options)
|
39
41
|
end
|
40
42
|
|
@@ -45,14 +47,14 @@ class TextExtractor
|
|
45
47
|
private
|
46
48
|
|
47
49
|
DIRECTIVE_MAP = {
|
48
|
-
' '
|
49
|
-
'any'
|
50
|
-
'begin'
|
50
|
+
' ' => { class: Comment },
|
51
|
+
'any' => { class: Any },
|
52
|
+
'begin' => { class: Begin, arguments: :parsed },
|
51
53
|
'capture' => { class: Capture, arguments: :parsed },
|
52
|
-
'end'
|
53
|
-
'maybe'
|
54
|
+
'end' => { class: End },
|
55
|
+
'maybe' => { class: Maybe },
|
54
56
|
'repeat' => { class: Repeat, arguments: :parse },
|
55
|
-
'rest'
|
57
|
+
'rest' => { class: Rest }
|
56
58
|
}.freeze
|
57
59
|
private_constant :DIRECTIVE_MAP
|
58
60
|
|
@@ -97,6 +99,7 @@ class TextExtractor
|
|
97
99
|
|
98
100
|
def parse_directives(full_source)
|
99
101
|
return [Comment.new(@state)] if full_source.start_with?(' ')
|
102
|
+
|
100
103
|
split_directives(full_source)
|
101
104
|
.map { |source| parse_one_directive(source) }
|
102
105
|
.each { |directive| @directives << directive }
|
@@ -105,6 +108,7 @@ class TextExtractor
|
|
105
108
|
def parse_one_directive(source)
|
106
109
|
md = source.match(/^[a-z_]+/) || source.match(/^ /)
|
107
110
|
raise "Unknown directive(s) in #{@state.current_line}" unless md
|
111
|
+
|
108
112
|
word = md[0]
|
109
113
|
map = DIRECTIVE_MAP.fetch(word) { raise "Unknown directive #{word}" }
|
110
114
|
args = parse_arguments(map[:arguments], md.post_match)
|
@@ -118,6 +122,7 @@ class TextExtractor
|
|
118
122
|
def parse_arguments(rule, source)
|
119
123
|
return [] unless rule
|
120
124
|
return rule.call(source) if rule.is_a?(Proc)
|
125
|
+
|
121
126
|
source.match(/\(([^)]*)\)/) { |md| md[1] }
|
122
127
|
end
|
123
128
|
end # class Expander
|
@@ -4,17 +4,15 @@ class TextExtractor
|
|
4
4
|
class Record
|
5
5
|
attr_reader :regexp, :factory, :values
|
6
6
|
|
7
|
-
# rubocop: disable Metrics/ParameterLists
|
8
7
|
def initialize(
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
**_kwargs
|
8
|
+
regexp,
|
9
|
+
factory: nil,
|
10
|
+
values: [],
|
11
|
+
fill: [],
|
12
|
+
directives: true,
|
13
|
+
inline: [],
|
14
|
+
extractor_values: {},
|
15
|
+
**_kwargs
|
18
16
|
)
|
19
17
|
@factory = factory
|
20
18
|
@constructor = FactoryAnalyzer.new(factory).to_proc
|
@@ -22,10 +20,9 @@ class TextExtractor
|
|
22
20
|
@values = values.map { |val| [val.id, val] }.to_h
|
23
21
|
initialize_inline_values(inline)
|
24
22
|
@default_values = values.map { |val| [val.id, nil] }.to_h
|
25
|
-
@regexp = build_regexp(regexp, directives
|
23
|
+
@regexp = build_regexp(regexp, directives)
|
26
24
|
@fill = Array(fill)
|
27
25
|
end
|
28
|
-
# rubocop: enable Metrics/ParameterLists
|
29
26
|
|
30
27
|
# @return Array
|
31
28
|
def extraction(match, fill)
|
@@ -37,36 +34,29 @@ class TextExtractor
|
|
37
34
|
|
38
35
|
def build_extraction(extracted)
|
39
36
|
return extracted unless @constructor
|
37
|
+
|
40
38
|
@constructor.call(extracted)
|
41
39
|
end
|
42
40
|
|
43
|
-
def build_regexp(regexp, directives
|
44
|
-
stripped = strip_regexp(regexp
|
45
|
-
|
46
|
-
|
41
|
+
def build_regexp(regexp, directives)
|
42
|
+
stripped = strip_regexp(regexp)
|
43
|
+
final = expand_regexp(stripped, directives)
|
44
|
+
|
45
|
+
raise EmptyRecordError, 'Empty record detected' if final =~ ''
|
46
|
+
|
47
|
+
final
|
47
48
|
end
|
48
49
|
|
49
|
-
def strip_regexp(regexp
|
50
|
+
def strip_regexp(regexp)
|
50
51
|
lines = regexp.source.split("\n")
|
51
52
|
prefix = lines.last
|
52
53
|
if lines.first =~ /\A\s*\z/ && prefix =~ /\A\s*\z/
|
53
54
|
lines.shift
|
54
55
|
lines = lines.map { |s| s.gsub(prefix, '') }
|
55
|
-
lines = lines.map(&regexp_line_stripper(strip))
|
56
56
|
end
|
57
57
|
Regexp.new(lines.join("\n"), regexp.options)
|
58
58
|
end
|
59
59
|
|
60
|
-
def regexp_line_stripper(strip)
|
61
|
-
case strip
|
62
|
-
when :left then ->(s) { s.lstrip }
|
63
|
-
when :right then ->(s) { s.rstrip }
|
64
|
-
when :both then ->(s) { s.strip }
|
65
|
-
when nil, false then ->(s) { s }
|
66
|
-
else raise "Unknown strip option: #{strip}"
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
60
|
def expand_regexp(regexp, directives)
|
71
61
|
if directives
|
72
62
|
expander = Directives.new(regexp)
|
@@ -80,21 +70,6 @@ class TextExtractor
|
|
80
70
|
end
|
81
71
|
end
|
82
72
|
|
83
|
-
def ignore_regexp(regexp, strip)
|
84
|
-
return regexp unless strip
|
85
|
-
lines = regexp.source.split("\n").map(&regexp_line_ignorer(strip))
|
86
|
-
Regexp.new(lines.join("\n"), regexp.options)
|
87
|
-
end
|
88
|
-
|
89
|
-
def regexp_line_ignorer(strip)
|
90
|
-
case strip
|
91
|
-
when :left then ->(s) { "\[ \\t\\r\\f]*#{s}" }
|
92
|
-
when :right then ->(s) { "#{s}\[ \\t\\r\\f]*" }
|
93
|
-
when :both then ->(s) { "\[ \\t\\r\\f]*#{s}\[ \\t\\r\\f]*" }
|
94
|
-
else raise "Unknown ignore whitespace option: #{strip}"
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
73
|
def match(string, pos = 0)
|
99
74
|
@regexp.match(string, pos)
|
100
75
|
end
|
@@ -178,4 +153,6 @@ class TextExtractor
|
|
178
153
|
end
|
179
154
|
end # class FactoryAnalyzer
|
180
155
|
end # class Record
|
156
|
+
|
157
|
+
class EmptyRecordError < StandardError; end
|
181
158
|
end # class TextExtractor
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Miller
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-04-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '2.0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '2.0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: minitest
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -44,28 +44,28 @@ dependencies:
|
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
47
|
+
version: '13.0'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
54
|
+
version: '13.0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: rubocop
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '0.
|
61
|
+
version: '0.82'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '0.
|
68
|
+
version: '0.82'
|
69
69
|
description:
|
70
70
|
email: bjmllr@gmail.com
|
71
71
|
executables: []
|
@@ -104,7 +104,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
104
104
|
version: '0'
|
105
105
|
requirements: []
|
106
106
|
rubyforge_project:
|
107
|
-
rubygems_version: 2.7.
|
107
|
+
rubygems_version: 2.7.6.2
|
108
108
|
signing_key:
|
109
109
|
specification_version: 4
|
110
110
|
summary: Easily extract data from text
|