text_extractor 0.5.2 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 536459e92aed84134a79cc83b43582806dc241b7c6e1f634da83f33cbc204fb9
4
- data.tar.gz: 7840132fc524f8c6a5ec564e5e99a53adb58854e18fc292265f12a149dfd0c44
3
+ metadata.gz: bc66aa843889a7f5396d26c41d7756fb56b2157563d7aa8640732867d32750c4
4
+ data.tar.gz: d491b948b0baece51042436d5d9d934d6800e26255bbf4a73f8f815b1aedda44
5
5
  SHA512:
6
- metadata.gz: 9ba4320ef35cef8e1a313c37cb800c9ea2dd716564f0105f94839f48d03ca17e02a53a96cadc40aa9475113cb48e87f866f9f56c7a75e7bed84b94db15e9c741
7
- data.tar.gz: acebe63b3cedf8c1c1e4acca41030ffb86d1782fc71612e6f4200a18d6e953b3e3d7f41f28da269a1fb579e9e93e11b406b32e51a9d211d057156d2b2ecd5884
6
+ metadata.gz: 4111f9a6090fb6314fea8e164b9bd6fedb669bdfd8abe8031cbfbedc3c0e52df83f7ff2c0377bf00aa53340ee8423c7ab7d203d770114d1a756d09b4d0869b29
7
+ data.tar.gz: 48fa9d25532211f7473cd9535ad3740fe34cc79fc0f0628df35399c7711a303d16f5cc0182367fdfb4968bd1e47e6a608fa5e36d239ae9738d92afc8a232d221
@@ -8,37 +8,53 @@ require_relative 'text_extractor/inline_value'
8
8
 
9
9
  # represents an extractor definition
10
10
  class TextExtractor
11
+ @append_newline = false
12
+
13
+ singleton_class.instance_eval do
14
+ attr_accessor :append_newline
15
+ end
16
+
11
17
  attr_reader :records, :values
12
18
 
13
- # rubocop: disable Metrics/MethodLength
14
19
  def initialize(&block)
15
20
  raise "#{self.class}.new requires a block" unless block
21
+
22
+ initialize_options
23
+ initialize_collections
24
+ instance_exec(&block)
25
+ @append_guards.each { |g| guard(**g, &g[:block]) }
26
+ end
27
+
28
+ def initialize_options
29
+ @factory = nil
30
+ @section_delimiter = nil
31
+ @section_terminator = nil
32
+ @strip = nil
33
+ @append_newline = nil
34
+ end
35
+
36
+ def initialize_collections
16
37
  @values = {}
17
38
  @fill = {}
18
39
  @values = {}
19
40
  @records = []
20
41
  @filldowns = []
21
42
  @current_record_values = []
22
- @section_delimiter = nil
23
- @section_terminator = nil
24
43
  @append_guards = []
25
- instance_exec(&block)
26
- @append_guards.each { |g| guard(**g, &g[:block]) }
27
44
  end
28
- # rubocop: enable Metrics/MethodLength
29
45
 
30
46
  module Patterns
31
- INTEGER = /\d+/
32
- FLOAT = /\d+\.?|\d*\.\d+/
33
- RATIONAL = %r{\d+/\d+}
34
- IPV4 = /[0-9.]{7,15}/
35
- IPV6 = /[:a-fA-F0-9\.]{2,45}/
47
+ INTEGER = /\d+/.freeze
48
+ FLOAT = /\d+\.?|\d*\.\d+/.freeze
49
+ RATIONAL = %r{\d+/\d+}.freeze
50
+ IPV4 = /[0-9.]{7,15}/.freeze
51
+ IPV6 = /[:a-fA-F0-9\.]{2,45}/.freeze
36
52
  IPADDR = Regexp.union(IPV4, IPV6)
37
- IPV4_NET = %r{#{IPV4}/\d{1,2}}
38
- IPV6_NET = %r{#{IPV6}\/\d{1,3}}
53
+ IPV4_NET = %r{#{IPV4}/\d{1,2}}.freeze
54
+ IPV6_NET = %r{#{IPV6}\/\d{1,3}}.freeze
39
55
  IPNETADDR = Regexp.union(IPV4_NET, IPV6_NET)
40
- TRUE = /y|yes|t|true|on/i
41
- FALSE = /n|no|f|false|off/i
56
+ TRUE = /y|yes|t|true|on/i.freeze
57
+ FALSE = /n|no|f|false|off/i.freeze
42
58
  BOOLEAN = Regexp.union(TRUE, FALSE)
43
59
  end
44
60
 
@@ -78,8 +94,16 @@ class TextExtractor
78
94
  value(id, re) { |val| IPAddr.new(val) }
79
95
  end
80
96
 
97
+ def append_newline(activate = nil)
98
+ return TextExtractor.append_newline if activate.nil? && @append_newline.nil?
99
+ return @append_newline if activate.nil?
100
+
101
+ @append_newline = activate
102
+ end
103
+
81
104
  def record(klass = Record, **kwargs, &block)
82
105
  raise "#{self.class}.record requires a block" unless block
106
+
83
107
  kwargs[:extractor_values] = values
84
108
  kwargs[:factory] ||= @factory if @factory
85
109
  kwargs[:values] = @current_record_values = []
@@ -91,6 +115,17 @@ class TextExtractor
91
115
  @section_terminator = terminator
92
116
  end
93
117
 
118
+ STRIP_PROCS = {
119
+ left: ->(s) { s.split("\n").map(&:lstrip).join("\n") + "\n" },
120
+ right: ->(s) { s.split("\n").map(&:rstrip).join("\n") + "\n" },
121
+ both: ->(s) { s.split("\n").map(&:strip).join("\n") + "\n" }
122
+ }.freeze
123
+
124
+ def strip(side = nil)
125
+ @strip = STRIP_PROCS[side] ||
126
+ (raise ArgumentError, 'Unknown strip option')
127
+ end
128
+
94
129
  def factory(object = nil)
95
130
  if object
96
131
  @factory = object
@@ -101,6 +136,7 @@ class TextExtractor
101
136
 
102
137
  def filldown(**kwargs, &block)
103
138
  raise "#{self.class}.filldown requires a block" unless block
139
+
104
140
  record(Filldown, **kwargs, &block)
105
141
  end
106
142
 
@@ -110,6 +146,7 @@ class TextExtractor
110
146
 
111
147
  def guard(**kwargs, &block)
112
148
  raise "#{self.class}.guard requires a block" unless block
149
+
113
150
  record(Guard, **kwargs, &block)
114
151
  end
115
152
 
@@ -119,6 +156,8 @@ class TextExtractor
119
156
  end
120
157
 
121
158
  def scan(input)
159
+ input = @strip.call(input) if @strip
160
+ input += "\n" if append_newline && !input.end_with?("\n")
122
161
  prefill = {}
123
162
  sections(input).flat_map { |section|
124
163
  Extraction.new(section, self, prefill).scan.extraction_matches
@@ -136,6 +175,7 @@ class TextExtractor
136
175
 
137
176
  def skip(**kwargs, &block)
138
177
  raise "#{self.class}.skip requires a block" unless block
178
+
139
179
  record(Skip, **kwargs, &block)
140
180
  end
141
181
 
@@ -31,10 +31,12 @@ class TextExtractor
31
31
 
32
32
  def expand
33
33
  return @output if @output
34
+
34
35
  @state = State.new
35
36
  scanner = StringScanner.new(@source)
36
37
  read_line(scanner) until scanner.eos?
37
38
  raise 'Unterminated line group' unless @state.groups.empty?
39
+
38
40
  @output = Regexp.new(@state.target.join(''), @options)
39
41
  end
40
42
 
@@ -45,14 +47,14 @@ class TextExtractor
45
47
  private
46
48
 
47
49
  DIRECTIVE_MAP = {
48
- ' ' => { class: Comment },
49
- 'any' => { class: Any },
50
- 'begin' => { class: Begin, arguments: :parsed },
50
+ ' ' => { class: Comment },
51
+ 'any' => { class: Any },
52
+ 'begin' => { class: Begin, arguments: :parsed },
51
53
  'capture' => { class: Capture, arguments: :parsed },
52
- 'end' => { class: End },
53
- 'maybe' => { class: Maybe },
54
+ 'end' => { class: End },
55
+ 'maybe' => { class: Maybe },
54
56
  'repeat' => { class: Repeat, arguments: :parse },
55
- 'rest' => { class: Rest }
57
+ 'rest' => { class: Rest }
56
58
  }.freeze
57
59
  private_constant :DIRECTIVE_MAP
58
60
 
@@ -97,6 +99,7 @@ class TextExtractor
97
99
 
98
100
  def parse_directives(full_source)
99
101
  return [Comment.new(@state)] if full_source.start_with?(' ')
102
+
100
103
  split_directives(full_source)
101
104
  .map { |source| parse_one_directive(source) }
102
105
  .each { |directive| @directives << directive }
@@ -105,6 +108,7 @@ class TextExtractor
105
108
  def parse_one_directive(source)
106
109
  md = source.match(/^[a-z_]+/) || source.match(/^ /)
107
110
  raise "Unknown directive(s) in #{@state.current_line}" unless md
111
+
108
112
  word = md[0]
109
113
  map = DIRECTIVE_MAP.fetch(word) { raise "Unknown directive #{word}" }
110
114
  args = parse_arguments(map[:arguments], md.post_match)
@@ -118,6 +122,7 @@ class TextExtractor
118
122
  def parse_arguments(rule, source)
119
123
  return [] unless rule
120
124
  return rule.call(source) if rule.is_a?(Proc)
125
+
121
126
  source.match(/\(([^)]*)\)/) { |md| md[1] }
122
127
  end
123
128
  end # class Expander
@@ -17,6 +17,7 @@ class TextExtractor
17
17
 
18
18
  def chomp(newline)
19
19
  return if @lines.empty? || newline
20
+
20
21
  tail = @lines[-1]
21
22
  if tail.is_a?(Array)
22
23
  tail = tail[-1] while tail[-1].is_a?(Array)
@@ -27,6 +27,7 @@ class TextExtractor
27
27
  loop do
28
28
  match = input.match(re, pos)
29
29
  break unless match
30
+
30
31
  @pos = match.end(0)
31
32
  @matches << match
32
33
  end
@@ -4,17 +4,15 @@ class TextExtractor
4
4
  class Record
5
5
  attr_reader :regexp, :factory, :values
6
6
 
7
- # rubocop: disable Metrics/ParameterLists
8
7
  def initialize(
9
- regexp,
10
- factory: nil,
11
- values: [],
12
- fill: [],
13
- directives: true,
14
- inline: [],
15
- extractor_values: {},
16
- strip: nil,
17
- **_kwargs
8
+ regexp,
9
+ factory: nil,
10
+ values: [],
11
+ fill: [],
12
+ directives: true,
13
+ inline: [],
14
+ extractor_values: {},
15
+ **_kwargs
18
16
  )
19
17
  @factory = factory
20
18
  @constructor = FactoryAnalyzer.new(factory).to_proc
@@ -22,10 +20,9 @@ class TextExtractor
22
20
  @values = values.map { |val| [val.id, val] }.to_h
23
21
  initialize_inline_values(inline)
24
22
  @default_values = values.map { |val| [val.id, nil] }.to_h
25
- @regexp = build_regexp(regexp, directives, strip)
23
+ @regexp = build_regexp(regexp, directives)
26
24
  @fill = Array(fill)
27
25
  end
28
- # rubocop: enable Metrics/ParameterLists
29
26
 
30
27
  # @return Array
31
28
  def extraction(match, fill)
@@ -37,36 +34,29 @@ class TextExtractor
37
34
 
38
35
  def build_extraction(extracted)
39
36
  return extracted unless @constructor
37
+
40
38
  @constructor.call(extracted)
41
39
  end
42
40
 
43
- def build_regexp(regexp, directives, strip)
44
- stripped = strip_regexp(regexp, strip)
45
- expanded = expand_regexp(stripped, directives)
46
- ignore_regexp(expanded, strip)
41
+ def build_regexp(regexp, directives)
42
+ stripped = strip_regexp(regexp)
43
+ final = expand_regexp(stripped, directives)
44
+
45
+ raise EmptyRecordError, 'Empty record detected' if final =~ ''
46
+
47
+ final
47
48
  end
48
49
 
49
- def strip_regexp(regexp, strip)
50
+ def strip_regexp(regexp)
50
51
  lines = regexp.source.split("\n")
51
52
  prefix = lines.last
52
53
  if lines.first =~ /\A\s*\z/ && prefix =~ /\A\s*\z/
53
54
  lines.shift
54
55
  lines = lines.map { |s| s.gsub(prefix, '') }
55
- lines = lines.map(&regexp_line_stripper(strip))
56
56
  end
57
57
  Regexp.new(lines.join("\n"), regexp.options)
58
58
  end
59
59
 
60
- def regexp_line_stripper(strip)
61
- case strip
62
- when :left then ->(s) { s.lstrip }
63
- when :right then ->(s) { s.rstrip }
64
- when :both then ->(s) { s.strip }
65
- when nil, false then ->(s) { s }
66
- else raise "Unknown strip option: #{strip}"
67
- end
68
- end
69
-
70
60
  def expand_regexp(regexp, directives)
71
61
  if directives
72
62
  expander = Directives.new(regexp)
@@ -80,21 +70,6 @@ class TextExtractor
80
70
  end
81
71
  end
82
72
 
83
- def ignore_regexp(regexp, strip)
84
- return regexp unless strip
85
- lines = regexp.source.split("\n").map(&regexp_line_ignorer(strip))
86
- Regexp.new(lines.join("\n"), regexp.options)
87
- end
88
-
89
- def regexp_line_ignorer(strip)
90
- case strip
91
- when :left then ->(s) { "\[ \\t\\r\\f]*#{s}" }
92
- when :right then ->(s) { "#{s}\[ \\t\\r\\f]*" }
93
- when :both then ->(s) { "\[ \\t\\r\\f]*#{s}\[ \\t\\r\\f]*" }
94
- else raise "Unknown ignore whitespace option: #{strip}"
95
- end
96
- end
97
-
98
73
  def match(string, pos = 0)
99
74
  @regexp.match(string, pos)
100
75
  end
@@ -178,4 +153,6 @@ class TextExtractor
178
153
  end
179
154
  end # class FactoryAnalyzer
180
155
  end # class Record
156
+
157
+ class EmptyRecordError < StandardError; end
181
158
  end # class TextExtractor
@@ -1,5 +1,5 @@
1
1
  class TextExtractor
2
2
  def self.version
3
- '0.5.2'
3
+ '0.6.0'
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.2
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Miller
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-08-15 00:00:00.000000000 Z
11
+ date: 2020-04-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.0'
19
+ version: '2.0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.0'
26
+ version: '2.0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: minitest
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -44,28 +44,28 @@ dependencies:
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '10.0'
47
+ version: '13.0'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '10.0'
54
+ version: '13.0'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: rubocop
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '0.54'
61
+ version: '0.82'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '0.54'
68
+ version: '0.82'
69
69
  description:
70
70
  email: bjmllr@gmail.com
71
71
  executables: []
@@ -104,7 +104,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
104
104
  version: '0'
105
105
  requirements: []
106
106
  rubyforge_project:
107
- rubygems_version: 2.7.3
107
+ rubygems_version: 2.7.6.2
108
108
  signing_key:
109
109
  specification_version: 4
110
110
  summary: Easily extract data from text