text_extractor 0.4.0 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 5412586591653945efffad2ce8783c9269e58582
4
- data.tar.gz: dc9fdb2f1eaad7bfafab31ad13e3e5eff966ea22
2
+ SHA256:
3
+ metadata.gz: 6d814b19dedf7312cee41b86de5ac98115240908b9a5ef17d343eb5450d2d608
4
+ data.tar.gz: d365e26b0ba0a97d5ec25854767248ceff60f91c0808a24b06cf2fb425b87864
5
5
  SHA512:
6
- metadata.gz: f8c4a71b402c49c136700eab64daeba10a48938285d1d06e1522bb8bbd47fe1cf5f6962f0d2dcc555fa413164185e50eed069a52ac6bc4b253b1dd973529d09b
7
- data.tar.gz: 73117018ffc3542a71aa201ac84ddc5d038881605e3a3fdccabce1d0d65a77137f2b731cac6a4463c8e3902c431fa418e166bcefe0cd6feedb899616a75a9e29
6
+ metadata.gz: e39cfa57ff12e5df3011e729627c37948b658c8c82447398c1547b94486b0ef46cd0b4f6491e598375010c3882ca23a2fa5288d407cddaebc88d74996712f9e7
7
+ data.tar.gz: 1a680465ce5f430100b01ec9b2fdc68f51813b280b03d886454f8c0c61d7cafee72a7097e53c2cedcf09d2cb399ff56931719978207e648cf6ef9d2401844022
@@ -1,6 +1,8 @@
1
1
  require_relative 'text_extractor/extraction'
2
2
  require_relative 'text_extractor/filldown'
3
+ require_relative 'text_extractor/guard'
3
4
  require_relative 'text_extractor/record'
5
+ require_relative 'text_extractor/skip'
4
6
  require_relative 'text_extractor/value'
5
7
  require_relative 'text_extractor/inline_value'
6
8
 
@@ -8,6 +10,7 @@ require_relative 'text_extractor/inline_value'
8
10
  class TextExtractor
9
11
  attr_reader :records, :values
10
12
 
13
+ # rubocop: disable Metrics/MethodLength
11
14
  def initialize(&block)
12
15
  raise "#{self.class}.new requires a block" unless block
13
16
  @values = {}
@@ -18,8 +21,11 @@ class TextExtractor
18
21
  @current_record_values = []
19
22
  @section_delimiter = nil
20
23
  @section_terminator = nil
24
+ @append_guards = []
21
25
  instance_exec(&block)
26
+ @append_guards.each { |g| guard(**g, &g[:block]) }
22
27
  end
28
+ # rubocop: enable Metrics/MethodLength
23
29
 
24
30
  module Patterns
25
31
  INTEGER = /\d+/
@@ -28,8 +34,8 @@ class TextExtractor
28
34
  IPV4 = /[0-9.]{7,15}/
29
35
  IPV6 = /[:a-fA-F0-9\.]{2,45}/
30
36
  IPADDR = Regexp.union(IPV4, IPV6)
31
- IPV4_NET = /#{IPV4}\/\d{1,2}/
32
- IPV6_NET = /#{IPV6}\/\d{1,3}/
37
+ IPV4_NET = %r{#{IPV4}/\d{1,2}}
38
+ IPV6_NET = %r{#{IPV6}\/\d{1,3}}
33
39
  IPNETADDR = Regexp.union(IPV4_NET, IPV6_NET)
34
40
  TRUE = /y|yes|t|true|on/i
35
41
  FALSE = /n|no|f|false|off/i
@@ -75,6 +81,7 @@ class TextExtractor
75
81
  def record(klass = Record, **kwargs, &block)
76
82
  raise "#{self.class}.record requires a block" unless block
77
83
  kwargs[:extractor_values] = values
84
+ kwargs[:factory] ||= @factory if @factory
78
85
  kwargs[:values] = @current_record_values = []
79
86
  @records << klass.new(instance_exec(&block), **kwargs)
80
87
  end
@@ -84,6 +91,14 @@ class TextExtractor
84
91
  @section_terminator = terminator
85
92
  end
86
93
 
94
+ def factory(object = nil)
95
+ if object
96
+ @factory = object
97
+ else
98
+ @factory
99
+ end
100
+ end
101
+
87
102
  def filldown(**kwargs, &block)
88
103
  raise "#{self.class}.filldown requires a block" unless block
89
104
  record(Filldown, **kwargs, &block)
@@ -93,6 +108,16 @@ class TextExtractor
93
108
  records[records.length.times.find_index { |i| match["__#{i}"] }]
94
109
  end
95
110
 
111
+ def guard(**kwargs, &block)
112
+ raise "#{self.class}.guard requires a block" unless block
113
+ record(Guard, **kwargs, &block)
114
+ end
115
+
116
+ def guards(*guard_args)
117
+ guard_args = Guards::DEFAULT if guard_args.empty?
118
+ @append_guards = guard_args
119
+ end
120
+
96
121
  def scan(input)
97
122
  prefill = {}
98
123
  sections(input).flat_map { |section|
@@ -109,6 +134,11 @@ class TextExtractor
109
134
  texts.map { |section| section + @section_terminator }
110
135
  end
111
136
 
137
+ def skip(**kwargs, &block)
138
+ raise "#{self.class}.skip requires a block" unless block
139
+ record(Skip, **kwargs, &block)
140
+ end
141
+
112
142
  def regexps
113
143
  @records.map.with_index do |record, i|
114
144
  Regexp.new("(?<__#{i}>#{record.source})", record.options)
@@ -61,8 +61,7 @@ class TextExtractor
61
61
 
62
62
  # text that will be omitted from the regexp
63
63
  class Comment < Directive
64
- def call
65
- end
64
+ def call; end
66
65
  end
67
66
 
68
67
  # close a line group
@@ -0,0 +1,41 @@
1
+ require_relative 'record'
2
+
3
+ class TextExtractor
4
+ class Guard < Record
5
+ def initialize(_regexp, description:, **kwargs)
6
+ super
7
+ @description = description
8
+ @factory ||= :itself.to_proc
9
+ end
10
+
11
+ def extraction(match, _fill)
12
+ text = @factory.call(match[0])
13
+ raise GuardError, "#{@description} near #{text.inspect}"
14
+ end
15
+ end
16
+
17
+ INDENTED = {
18
+ description: 'indented line',
19
+ block: proc {
20
+ /
21
+ ^[^\n\S]+[^\n]*$
22
+ /
23
+ }
24
+ }.freeze
25
+
26
+ UNINDENTED = {
27
+ description: 'unindented line',
28
+ block: proc {
29
+ /
30
+ ^\S+[^\n]*$
31
+ /
32
+ }
33
+ }.freeze
34
+
35
+ DEFAULT = [
36
+ INDENTED,
37
+ UNINDENTED
38
+ ].freeze
39
+
40
+ class GuardError < StandardError; end
41
+ end # class TextExtractor
@@ -4,8 +4,18 @@ class TextExtractor
4
4
  class Record
5
5
  attr_reader :regexp, :factory, :values
6
6
 
7
- def initialize(regexp, factory: nil, values: [], fill: [], directives: true,
8
- inline: [], extractor_values: {}, strip: nil)
7
+ # rubocop: disable Metrics/ParameterLists
8
+ def initialize(
9
+ regexp,
10
+ factory: nil,
11
+ values: [],
12
+ fill: [],
13
+ directives: true,
14
+ inline: [],
15
+ extractor_values: {},
16
+ strip: nil,
17
+ **_kwargs
18
+ )
9
19
  @factory = factory
10
20
  @constructor = FactoryAnalyzer.new(factory).to_proc
11
21
  @extractor_values = extractor_values
@@ -15,12 +25,14 @@ class TextExtractor
15
25
  @regexp = build_regexp(regexp, directives, strip)
16
26
  @fill = Array(fill)
17
27
  end
28
+ # rubocop: enable Metrics/ParameterLists
18
29
 
30
+ # @return Array
19
31
  def extraction(match, fill)
20
32
  extracted = {}.merge!(@default_values)
21
- .merge!(extract_fills fill)
22
- .merge!(extract_values match)
23
- build_extraction(extracted)
33
+ .merge!(extract_fills(fill))
34
+ .merge!(extract_values(match))
35
+ [build_extraction(extracted)]
24
36
  end
25
37
 
26
38
  def build_extraction(extracted)
@@ -59,9 +71,9 @@ class TextExtractor
59
71
  if directives
60
72
  expander = Directives.new(regexp)
61
73
  expanded = expander.expand
62
- expander.values.each { |value|
74
+ expander.values.each do |value|
63
75
  values[value.id] = @extractor_values.fetch(value.id, value)
64
- }
76
+ end
65
77
  expanded
66
78
  else
67
79
  regexp
@@ -126,7 +138,7 @@ class TextExtractor
126
138
  def to_proc
127
139
  if @params
128
140
  explicit
129
- elsif @klass.is_a?(Proc)
141
+ elsif @klass.respond_to?(:call)
130
142
  @klass
131
143
  elsif @klass
132
144
  implicit
@@ -0,0 +1,9 @@
1
+ require_relative 'record'
2
+
3
+ class TextExtractor
4
+ class Skip < Record
5
+ def extraction(*)
6
+ []
7
+ end
8
+ end # class Skip < Record
9
+ end # class TextExtractor
@@ -10,9 +10,9 @@ class TextExtractor
10
10
 
11
11
  def convert(value)
12
12
  @block ? @block.call(value) : value
13
- rescue => e
13
+ rescue StandardError => e
14
14
  raise e.class,
15
- "in custom conversion of "\
15
+ 'in custom conversion of '\
16
16
  "value(#{id.inspect}, #{re.inspect}): #{e.message}"
17
17
  end
18
18
  end
@@ -1,5 +1,5 @@
1
1
  class TextExtractor
2
2
  def self.version
3
- '0.4.0'
3
+ '0.5.0'
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Miller
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-11-28 00:00:00.000000000 Z
11
+ date: 2018-08-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: minitest
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '5.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '5.0'
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: rake
29
43
  requirement: !ruby/object:Gem::Requirement
@@ -39,19 +53,19 @@ dependencies:
39
53
  - !ruby/object:Gem::Version
40
54
  version: '10.0'
41
55
  - !ruby/object:Gem::Dependency
42
- name: minitest
56
+ name: rubocop
43
57
  requirement: !ruby/object:Gem::Requirement
44
58
  requirements:
45
59
  - - "~>"
46
60
  - !ruby/object:Gem::Version
47
- version: '5.0'
61
+ version: '0.54'
48
62
  type: :development
49
63
  prerelease: false
50
64
  version_requirements: !ruby/object:Gem::Requirement
51
65
  requirements:
52
66
  - - "~>"
53
67
  - !ruby/object:Gem::Version
54
- version: '5.0'
68
+ version: '0.54'
55
69
  description:
56
70
  email: bjmllr@gmail.com
57
71
  executables: []
@@ -64,8 +78,10 @@ files:
64
78
  - lib/text_extractor/directives/group.rb
65
79
  - lib/text_extractor/extraction.rb
66
80
  - lib/text_extractor/filldown.rb
81
+ - lib/text_extractor/guard.rb
67
82
  - lib/text_extractor/inline_value.rb
68
83
  - lib/text_extractor/record.rb
84
+ - lib/text_extractor/skip.rb
69
85
  - lib/text_extractor/value.rb
70
86
  - lib/text_extractor/version.rb
71
87
  homepage: https://github.com/bjmllr/text_extractor
@@ -88,9 +104,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
88
104
  version: '0'
89
105
  requirements: []
90
106
  rubyforge_project:
91
- rubygems_version: 2.5.2
107
+ rubygems_version: 2.7.3
92
108
  signing_key:
93
109
  specification_version: 4
94
110
  summary: Easily extract data from text
95
111
  test_files: []
96
- has_rdoc: