text_extractor 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 536459e92aed84134a79cc83b43582806dc241b7c6e1f634da83f33cbc204fb9
4
- data.tar.gz: 7840132fc524f8c6a5ec564e5e99a53adb58854e18fc292265f12a149dfd0c44
3
+ metadata.gz: bc66aa843889a7f5396d26c41d7756fb56b2157563d7aa8640732867d32750c4
4
+ data.tar.gz: d491b948b0baece51042436d5d9d934d6800e26255bbf4a73f8f815b1aedda44
5
5
  SHA512:
6
- metadata.gz: 9ba4320ef35cef8e1a313c37cb800c9ea2dd716564f0105f94839f48d03ca17e02a53a96cadc40aa9475113cb48e87f866f9f56c7a75e7bed84b94db15e9c741
7
- data.tar.gz: acebe63b3cedf8c1c1e4acca41030ffb86d1782fc71612e6f4200a18d6e953b3e3d7f41f28da269a1fb579e9e93e11b406b32e51a9d211d057156d2b2ecd5884
6
+ metadata.gz: 4111f9a6090fb6314fea8e164b9bd6fedb669bdfd8abe8031cbfbedc3c0e52df83f7ff2c0377bf00aa53340ee8423c7ab7d203d770114d1a756d09b4d0869b29
7
+ data.tar.gz: 48fa9d25532211f7473cd9535ad3740fe34cc79fc0f0628df35399c7711a303d16f5cc0182367fdfb4968bd1e47e6a608fa5e36d239ae9738d92afc8a232d221
@@ -8,37 +8,53 @@ require_relative 'text_extractor/inline_value'
8
8
 
9
9
  # represents an extractor definition
10
10
  class TextExtractor
11
+ @append_newline = false
12
+
13
+ singleton_class.instance_eval do
14
+ attr_accessor :append_newline
15
+ end
16
+
11
17
  attr_reader :records, :values
12
18
 
13
- # rubocop: disable Metrics/MethodLength
14
19
  def initialize(&block)
15
20
  raise "#{self.class}.new requires a block" unless block
21
+
22
+ initialize_options
23
+ initialize_collections
24
+ instance_exec(&block)
25
+ @append_guards.each { |g| guard(**g, &g[:block]) }
26
+ end
27
+
28
+ def initialize_options
29
+ @factory = nil
30
+ @section_delimiter = nil
31
+ @section_terminator = nil
32
+ @strip = nil
33
+ @append_newline = nil
34
+ end
35
+
36
+ def initialize_collections
16
37
  @values = {}
17
38
  @fill = {}
18
39
  @values = {}
19
40
  @records = []
20
41
  @filldowns = []
21
42
  @current_record_values = []
22
- @section_delimiter = nil
23
- @section_terminator = nil
24
43
  @append_guards = []
25
- instance_exec(&block)
26
- @append_guards.each { |g| guard(**g, &g[:block]) }
27
44
  end
28
- # rubocop: enable Metrics/MethodLength
29
45
 
30
46
  module Patterns
31
- INTEGER = /\d+/
32
- FLOAT = /\d+\.?|\d*\.\d+/
33
- RATIONAL = %r{\d+/\d+}
34
- IPV4 = /[0-9.]{7,15}/
35
- IPV6 = /[:a-fA-F0-9\.]{2,45}/
47
+ INTEGER = /\d+/.freeze
48
+ FLOAT = /\d+\.?|\d*\.\d+/.freeze
49
+ RATIONAL = %r{\d+/\d+}.freeze
50
+ IPV4 = /[0-9.]{7,15}/.freeze
51
+ IPV6 = /[:a-fA-F0-9\.]{2,45}/.freeze
36
52
  IPADDR = Regexp.union(IPV4, IPV6)
37
- IPV4_NET = %r{#{IPV4}/\d{1,2}}
38
- IPV6_NET = %r{#{IPV6}\/\d{1,3}}
53
+ IPV4_NET = %r{#{IPV4}/\d{1,2}}.freeze
54
+ IPV6_NET = %r{#{IPV6}\/\d{1,3}}.freeze
39
55
  IPNETADDR = Regexp.union(IPV4_NET, IPV6_NET)
40
- TRUE = /y|yes|t|true|on/i
41
- FALSE = /n|no|f|false|off/i
56
+ TRUE = /y|yes|t|true|on/i.freeze
57
+ FALSE = /n|no|f|false|off/i.freeze
42
58
  BOOLEAN = Regexp.union(TRUE, FALSE)
43
59
  end
44
60
 
@@ -78,8 +94,16 @@ class TextExtractor
78
94
  value(id, re) { |val| IPAddr.new(val) }
79
95
  end
80
96
 
97
+ def append_newline(activate = nil)
98
+ return TextExtractor.append_newline if activate.nil? && @append_newline.nil?
99
+ return @append_newline if activate.nil?
100
+
101
+ @append_newline = activate
102
+ end
103
+
81
104
  def record(klass = Record, **kwargs, &block)
82
105
  raise "#{self.class}.record requires a block" unless block
106
+
83
107
  kwargs[:extractor_values] = values
84
108
  kwargs[:factory] ||= @factory if @factory
85
109
  kwargs[:values] = @current_record_values = []
@@ -91,6 +115,17 @@ class TextExtractor
91
115
  @section_terminator = terminator
92
116
  end
93
117
 
118
+ STRIP_PROCS = {
119
+ left: ->(s) { s.split("\n").map(&:lstrip).join("\n") + "\n" },
120
+ right: ->(s) { s.split("\n").map(&:rstrip).join("\n") + "\n" },
121
+ both: ->(s) { s.split("\n").map(&:strip).join("\n") + "\n" }
122
+ }.freeze
123
+
124
+ def strip(side = nil)
125
+ @strip = STRIP_PROCS[side] ||
126
+ (raise ArgumentError, 'Unknown strip option')
127
+ end
128
+
94
129
  def factory(object = nil)
95
130
  if object
96
131
  @factory = object
@@ -101,6 +136,7 @@ class TextExtractor
101
136
 
102
137
  def filldown(**kwargs, &block)
103
138
  raise "#{self.class}.filldown requires a block" unless block
139
+
104
140
  record(Filldown, **kwargs, &block)
105
141
  end
106
142
 
@@ -110,6 +146,7 @@ class TextExtractor
110
146
 
111
147
  def guard(**kwargs, &block)
112
148
  raise "#{self.class}.guard requires a block" unless block
149
+
113
150
  record(Guard, **kwargs, &block)
114
151
  end
115
152
 
@@ -119,6 +156,8 @@ class TextExtractor
119
156
  end
120
157
 
121
158
  def scan(input)
159
+ input = @strip.call(input) if @strip
160
+ input += "\n" if append_newline && !input.end_with?("\n")
122
161
  prefill = {}
123
162
  sections(input).flat_map { |section|
124
163
  Extraction.new(section, self, prefill).scan.extraction_matches
@@ -136,6 +175,7 @@ class TextExtractor
136
175
 
137
176
  def skip(**kwargs, &block)
138
177
  raise "#{self.class}.skip requires a block" unless block
178
+
139
179
  record(Skip, **kwargs, &block)
140
180
  end
141
181
 
@@ -31,10 +31,12 @@ class TextExtractor
31
31
 
32
32
  def expand
33
33
  return @output if @output
34
+
34
35
  @state = State.new
35
36
  scanner = StringScanner.new(@source)
36
37
  read_line(scanner) until scanner.eos?
37
38
  raise 'Unterminated line group' unless @state.groups.empty?
39
+
38
40
  @output = Regexp.new(@state.target.join(''), @options)
39
41
  end
40
42
 
@@ -45,14 +47,14 @@ class TextExtractor
45
47
  private
46
48
 
47
49
  DIRECTIVE_MAP = {
48
- ' ' => { class: Comment },
49
- 'any' => { class: Any },
50
- 'begin' => { class: Begin, arguments: :parsed },
50
+ ' ' => { class: Comment },
51
+ 'any' => { class: Any },
52
+ 'begin' => { class: Begin, arguments: :parsed },
51
53
  'capture' => { class: Capture, arguments: :parsed },
52
- 'end' => { class: End },
53
- 'maybe' => { class: Maybe },
54
+ 'end' => { class: End },
55
+ 'maybe' => { class: Maybe },
54
56
  'repeat' => { class: Repeat, arguments: :parse },
55
- 'rest' => { class: Rest }
57
+ 'rest' => { class: Rest }
56
58
  }.freeze
57
59
  private_constant :DIRECTIVE_MAP
58
60
 
@@ -97,6 +99,7 @@ class TextExtractor
97
99
 
98
100
  def parse_directives(full_source)
99
101
  return [Comment.new(@state)] if full_source.start_with?(' ')
102
+
100
103
  split_directives(full_source)
101
104
  .map { |source| parse_one_directive(source) }
102
105
  .each { |directive| @directives << directive }
@@ -105,6 +108,7 @@ class TextExtractor
105
108
  def parse_one_directive(source)
106
109
  md = source.match(/^[a-z_]+/) || source.match(/^ /)
107
110
  raise "Unknown directive(s) in #{@state.current_line}" unless md
111
+
108
112
  word = md[0]
109
113
  map = DIRECTIVE_MAP.fetch(word) { raise "Unknown directive #{word}" }
110
114
  args = parse_arguments(map[:arguments], md.post_match)
@@ -118,6 +122,7 @@ class TextExtractor
118
122
  def parse_arguments(rule, source)
119
123
  return [] unless rule
120
124
  return rule.call(source) if rule.is_a?(Proc)
125
+
121
126
  source.match(/\(([^)]*)\)/) { |md| md[1] }
122
127
  end
123
128
  end # class Expander
@@ -17,6 +17,7 @@ class TextExtractor
17
17
 
18
18
  def chomp(newline)
19
19
  return if @lines.empty? || newline
20
+
20
21
  tail = @lines[-1]
21
22
  if tail.is_a?(Array)
22
23
  tail = tail[-1] while tail[-1].is_a?(Array)
@@ -27,6 +27,7 @@ class TextExtractor
27
27
  loop do
28
28
  match = input.match(re, pos)
29
29
  break unless match
30
+
30
31
  @pos = match.end(0)
31
32
  @matches << match
32
33
  end
@@ -4,17 +4,15 @@ class TextExtractor
4
4
  class Record
5
5
  attr_reader :regexp, :factory, :values
6
6
 
7
- # rubocop: disable Metrics/ParameterLists
8
7
  def initialize(
9
- regexp,
10
- factory: nil,
11
- values: [],
12
- fill: [],
13
- directives: true,
14
- inline: [],
15
- extractor_values: {},
16
- strip: nil,
17
- **_kwargs
8
+ regexp,
9
+ factory: nil,
10
+ values: [],
11
+ fill: [],
12
+ directives: true,
13
+ inline: [],
14
+ extractor_values: {},
15
+ **_kwargs
18
16
  )
19
17
  @factory = factory
20
18
  @constructor = FactoryAnalyzer.new(factory).to_proc
@@ -22,10 +20,9 @@ class TextExtractor
22
20
  @values = values.map { |val| [val.id, val] }.to_h
23
21
  initialize_inline_values(inline)
24
22
  @default_values = values.map { |val| [val.id, nil] }.to_h
25
- @regexp = build_regexp(regexp, directives, strip)
23
+ @regexp = build_regexp(regexp, directives)
26
24
  @fill = Array(fill)
27
25
  end
28
- # rubocop: enable Metrics/ParameterLists
29
26
 
30
27
  # @return Array
31
28
  def extraction(match, fill)
@@ -37,36 +34,29 @@ class TextExtractor
37
34
 
38
35
  def build_extraction(extracted)
39
36
  return extracted unless @constructor
37
+
40
38
  @constructor.call(extracted)
41
39
  end
42
40
 
43
- def build_regexp(regexp, directives, strip)
44
- stripped = strip_regexp(regexp, strip)
45
- expanded = expand_regexp(stripped, directives)
46
- ignore_regexp(expanded, strip)
41
+ def build_regexp(regexp, directives)
42
+ stripped = strip_regexp(regexp)
43
+ final = expand_regexp(stripped, directives)
44
+
45
+ raise EmptyRecordError, 'Empty record detected' if final =~ ''
46
+
47
+ final
47
48
  end
48
49
 
49
- def strip_regexp(regexp, strip)
50
+ def strip_regexp(regexp)
50
51
  lines = regexp.source.split("\n")
51
52
  prefix = lines.last
52
53
  if lines.first =~ /\A\s*\z/ && prefix =~ /\A\s*\z/
53
54
  lines.shift
54
55
  lines = lines.map { |s| s.gsub(prefix, '') }
55
- lines = lines.map(&regexp_line_stripper(strip))
56
56
  end
57
57
  Regexp.new(lines.join("\n"), regexp.options)
58
58
  end
59
59
 
60
- def regexp_line_stripper(strip)
61
- case strip
62
- when :left then ->(s) { s.lstrip }
63
- when :right then ->(s) { s.rstrip }
64
- when :both then ->(s) { s.strip }
65
- when nil, false then ->(s) { s }
66
- else raise "Unknown strip option: #{strip}"
67
- end
68
- end
69
-
70
60
  def expand_regexp(regexp, directives)
71
61
  if directives
72
62
  expander = Directives.new(regexp)
@@ -80,21 +70,6 @@ class TextExtractor
80
70
  end
81
71
  end
82
72
 
83
- def ignore_regexp(regexp, strip)
84
- return regexp unless strip
85
- lines = regexp.source.split("\n").map(&regexp_line_ignorer(strip))
86
- Regexp.new(lines.join("\n"), regexp.options)
87
- end
88
-
89
- def regexp_line_ignorer(strip)
90
- case strip
91
- when :left then ->(s) { "\[ \\t\\r\\f]*#{s}" }
92
- when :right then ->(s) { "#{s}\[ \\t\\r\\f]*" }
93
- when :both then ->(s) { "\[ \\t\\r\\f]*#{s}\[ \\t\\r\\f]*" }
94
- else raise "Unknown ignore whitespace option: #{strip}"
95
- end
96
- end
97
-
98
73
  def match(string, pos = 0)
99
74
  @regexp.match(string, pos)
100
75
  end
@@ -178,4 +153,6 @@ class TextExtractor
178
153
  end
179
154
  end # class FactoryAnalyzer
180
155
  end # class Record
156
+
157
+ class EmptyRecordError < StandardError; end
181
158
  end # class TextExtractor
@@ -1,5 +1,5 @@
1
1
  class TextExtractor
2
2
  def self.version
3
- '0.5.2'
3
+ '0.6.0'
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.2
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Miller
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-08-15 00:00:00.000000000 Z
11
+ date: 2020-04-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.0'
19
+ version: '2.0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.0'
26
+ version: '2.0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: minitest
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -44,28 +44,28 @@ dependencies:
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '10.0'
47
+ version: '13.0'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '10.0'
54
+ version: '13.0'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: rubocop
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '0.54'
61
+ version: '0.82'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '0.54'
68
+ version: '0.82'
69
69
  description:
70
70
  email: bjmllr@gmail.com
71
71
  executables: []
@@ -104,7 +104,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
104
104
  version: '0'
105
105
  requirements: []
106
106
  rubyforge_project:
107
- rubygems_version: 2.7.3
107
+ rubygems_version: 2.7.6.2
108
108
  signing_key:
109
109
  specification_version: 4
110
110
  summary: Easily extract data from text