text_extractor 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 5412586591653945efffad2ce8783c9269e58582
4
- data.tar.gz: dc9fdb2f1eaad7bfafab31ad13e3e5eff966ea22
2
+ SHA256:
3
+ metadata.gz: 6d814b19dedf7312cee41b86de5ac98115240908b9a5ef17d343eb5450d2d608
4
+ data.tar.gz: d365e26b0ba0a97d5ec25854767248ceff60f91c0808a24b06cf2fb425b87864
5
5
  SHA512:
6
- metadata.gz: f8c4a71b402c49c136700eab64daeba10a48938285d1d06e1522bb8bbd47fe1cf5f6962f0d2dcc555fa413164185e50eed069a52ac6bc4b253b1dd973529d09b
7
- data.tar.gz: 73117018ffc3542a71aa201ac84ddc5d038881605e3a3fdccabce1d0d65a77137f2b731cac6a4463c8e3902c431fa418e166bcefe0cd6feedb899616a75a9e29
6
+ metadata.gz: e39cfa57ff12e5df3011e729627c37948b658c8c82447398c1547b94486b0ef46cd0b4f6491e598375010c3882ca23a2fa5288d407cddaebc88d74996712f9e7
7
+ data.tar.gz: 1a680465ce5f430100b01ec9b2fdc68f51813b280b03d886454f8c0c61d7cafee72a7097e53c2cedcf09d2cb399ff56931719978207e648cf6ef9d2401844022
@@ -1,6 +1,8 @@
1
1
  require_relative 'text_extractor/extraction'
2
2
  require_relative 'text_extractor/filldown'
3
+ require_relative 'text_extractor/guard'
3
4
  require_relative 'text_extractor/record'
5
+ require_relative 'text_extractor/skip'
4
6
  require_relative 'text_extractor/value'
5
7
  require_relative 'text_extractor/inline_value'
6
8
 
@@ -8,6 +10,7 @@ require_relative 'text_extractor/inline_value'
8
10
  class TextExtractor
9
11
  attr_reader :records, :values
10
12
 
13
+ # rubocop: disable Metrics/MethodLength
11
14
  def initialize(&block)
12
15
  raise "#{self.class}.new requires a block" unless block
13
16
  @values = {}
@@ -18,8 +21,11 @@ class TextExtractor
18
21
  @current_record_values = []
19
22
  @section_delimiter = nil
20
23
  @section_terminator = nil
24
+ @append_guards = []
21
25
  instance_exec(&block)
26
+ @append_guards.each { |g| guard(**g, &g[:block]) }
22
27
  end
28
+ # rubocop: enable Metrics/MethodLength
23
29
 
24
30
  module Patterns
25
31
  INTEGER = /\d+/
@@ -28,8 +34,8 @@ class TextExtractor
28
34
  IPV4 = /[0-9.]{7,15}/
29
35
  IPV6 = /[:a-fA-F0-9\.]{2,45}/
30
36
  IPADDR = Regexp.union(IPV4, IPV6)
31
- IPV4_NET = /#{IPV4}\/\d{1,2}/
32
- IPV6_NET = /#{IPV6}\/\d{1,3}/
37
+ IPV4_NET = %r{#{IPV4}/\d{1,2}}
38
+ IPV6_NET = %r{#{IPV6}\/\d{1,3}}
33
39
  IPNETADDR = Regexp.union(IPV4_NET, IPV6_NET)
34
40
  TRUE = /y|yes|t|true|on/i
35
41
  FALSE = /n|no|f|false|off/i
@@ -75,6 +81,7 @@ class TextExtractor
75
81
  def record(klass = Record, **kwargs, &block)
76
82
  raise "#{self.class}.record requires a block" unless block
77
83
  kwargs[:extractor_values] = values
84
+ kwargs[:factory] ||= @factory if @factory
78
85
  kwargs[:values] = @current_record_values = []
79
86
  @records << klass.new(instance_exec(&block), **kwargs)
80
87
  end
@@ -84,6 +91,14 @@ class TextExtractor
84
91
  @section_terminator = terminator
85
92
  end
86
93
 
94
+ def factory(object = nil)
95
+ if object
96
+ @factory = object
97
+ else
98
+ @factory
99
+ end
100
+ end
101
+
87
102
  def filldown(**kwargs, &block)
88
103
  raise "#{self.class}.filldown requires a block" unless block
89
104
  record(Filldown, **kwargs, &block)
@@ -93,6 +108,16 @@ class TextExtractor
93
108
  records[records.length.times.find_index { |i| match["__#{i}"] }]
94
109
  end
95
110
 
111
+ def guard(**kwargs, &block)
112
+ raise "#{self.class}.guard requires a block" unless block
113
+ record(Guard, **kwargs, &block)
114
+ end
115
+
116
+ def guards(*guard_args)
117
+ guard_args = Guards::DEFAULT if guard_args.empty?
118
+ @append_guards = guard_args
119
+ end
120
+
96
121
  def scan(input)
97
122
  prefill = {}
98
123
  sections(input).flat_map { |section|
@@ -109,6 +134,11 @@ class TextExtractor
109
134
  texts.map { |section| section + @section_terminator }
110
135
  end
111
136
 
137
+ def skip(**kwargs, &block)
138
+ raise "#{self.class}.skip requires a block" unless block
139
+ record(Skip, **kwargs, &block)
140
+ end
141
+
112
142
  def regexps
113
143
  @records.map.with_index do |record, i|
114
144
  Regexp.new("(?<__#{i}>#{record.source})", record.options)
@@ -61,8 +61,7 @@ class TextExtractor
61
61
 
62
62
  # text that will be omitted from the regexp
63
63
  class Comment < Directive
64
- def call
65
- end
64
+ def call; end
66
65
  end
67
66
 
68
67
  # close a line group
@@ -0,0 +1,41 @@
1
+ require_relative 'record'
2
+
3
+ class TextExtractor
4
+ class Guard < Record
5
+ def initialize(_regexp, description:, **kwargs)
6
+ super
7
+ @description = description
8
+ @factory ||= :itself.to_proc
9
+ end
10
+
11
+ def extraction(match, _fill)
12
+ text = @factory.call(match[0])
13
+ raise GuardError, "#{@description} near #{text.inspect}"
14
+ end
15
+ end
16
+
17
+ INDENTED = {
18
+ description: 'indented line',
19
+ block: proc {
20
+ /
21
+ ^[^\n\S]+[^\n]*$
22
+ /
23
+ }
24
+ }.freeze
25
+
26
+ UNINDENTED = {
27
+ description: 'unindented line',
28
+ block: proc {
29
+ /
30
+ ^\S+[^\n]*$
31
+ /
32
+ }
33
+ }.freeze
34
+
35
+ DEFAULT = [
36
+ INDENTED,
37
+ UNINDENTED
38
+ ].freeze
39
+
40
+ class GuardError < StandardError; end
41
+ end # class TextExtractor
@@ -4,8 +4,18 @@ class TextExtractor
4
4
  class Record
5
5
  attr_reader :regexp, :factory, :values
6
6
 
7
- def initialize(regexp, factory: nil, values: [], fill: [], directives: true,
8
- inline: [], extractor_values: {}, strip: nil)
7
+ # rubocop: disable Metrics/ParameterLists
8
+ def initialize(
9
+ regexp,
10
+ factory: nil,
11
+ values: [],
12
+ fill: [],
13
+ directives: true,
14
+ inline: [],
15
+ extractor_values: {},
16
+ strip: nil,
17
+ **_kwargs
18
+ )
9
19
  @factory = factory
10
20
  @constructor = FactoryAnalyzer.new(factory).to_proc
11
21
  @extractor_values = extractor_values
@@ -15,12 +25,14 @@ class TextExtractor
15
25
  @regexp = build_regexp(regexp, directives, strip)
16
26
  @fill = Array(fill)
17
27
  end
28
+ # rubocop: enable Metrics/ParameterLists
18
29
 
30
+ # @return Array
19
31
  def extraction(match, fill)
20
32
  extracted = {}.merge!(@default_values)
21
- .merge!(extract_fills fill)
22
- .merge!(extract_values match)
23
- build_extraction(extracted)
33
+ .merge!(extract_fills(fill))
34
+ .merge!(extract_values(match))
35
+ [build_extraction(extracted)]
24
36
  end
25
37
 
26
38
  def build_extraction(extracted)
@@ -59,9 +71,9 @@ class TextExtractor
59
71
  if directives
60
72
  expander = Directives.new(regexp)
61
73
  expanded = expander.expand
62
- expander.values.each { |value|
74
+ expander.values.each do |value|
63
75
  values[value.id] = @extractor_values.fetch(value.id, value)
64
- }
76
+ end
65
77
  expanded
66
78
  else
67
79
  regexp
@@ -126,7 +138,7 @@ class TextExtractor
126
138
  def to_proc
127
139
  if @params
128
140
  explicit
129
- elsif @klass.is_a?(Proc)
141
+ elsif @klass.respond_to?(:call)
130
142
  @klass
131
143
  elsif @klass
132
144
  implicit
@@ -0,0 +1,9 @@
1
+ require_relative 'record'
2
+
3
+ class TextExtractor
4
+ class Skip < Record
5
+ def extraction(*)
6
+ []
7
+ end
8
+ end # class Skip < Record
9
+ end # class TextExtractor
@@ -10,9 +10,9 @@ class TextExtractor
10
10
 
11
11
  def convert(value)
12
12
  @block ? @block.call(value) : value
13
- rescue => e
13
+ rescue StandardError => e
14
14
  raise e.class,
15
- "in custom conversion of "\
15
+ 'in custom conversion of '\
16
16
  "value(#{id.inspect}, #{re.inspect}): #{e.message}"
17
17
  end
18
18
  end
@@ -1,5 +1,5 @@
1
1
  class TextExtractor
2
2
  def self.version
3
- '0.4.0'
3
+ '0.5.0'
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Miller
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-11-28 00:00:00.000000000 Z
11
+ date: 2018-08-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: minitest
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '5.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '5.0'
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: rake
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -39,19 +53,19 @@ dependencies:
39
53
  - !ruby/object:Gem::Version
40
54
  version: '10.0'
41
55
  - !ruby/object:Gem::Dependency
42
- name: minitest
56
+ name: rubocop
43
57
  requirement: !ruby/object:Gem::Requirement
44
58
  requirements:
45
59
  - - "~>"
46
60
  - !ruby/object:Gem::Version
47
- version: '5.0'
61
+ version: '0.54'
48
62
  type: :development
49
63
  prerelease: false
50
64
  version_requirements: !ruby/object:Gem::Requirement
51
65
  requirements:
52
66
  - - "~>"
53
67
  - !ruby/object:Gem::Version
54
- version: '5.0'
68
+ version: '0.54'
55
69
  description:
56
70
  email: bjmllr@gmail.com
57
71
  executables: []
@@ -64,8 +78,10 @@ files:
64
78
  - lib/text_extractor/directives/group.rb
65
79
  - lib/text_extractor/extraction.rb
66
80
  - lib/text_extractor/filldown.rb
81
+ - lib/text_extractor/guard.rb
67
82
  - lib/text_extractor/inline_value.rb
68
83
  - lib/text_extractor/record.rb
84
+ - lib/text_extractor/skip.rb
69
85
  - lib/text_extractor/value.rb
70
86
  - lib/text_extractor/version.rb
71
87
  homepage: https://github.com/bjmllr/text_extractor
@@ -88,9 +104,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
88
104
  version: '0'
89
105
  requirements: []
90
106
  rubyforge_project:
91
- rubygems_version: 2.5.2
107
+ rubygems_version: 2.7.3
92
108
  signing_key:
93
109
  specification_version: 4
94
110
  summary: Easily extract data from text
95
111
  test_files: []
96
- has_rdoc: