text_extractor 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/lib/text_extractor.rb +32 -2
- data/lib/text_extractor/directives/classes.rb +1 -2
- data/lib/text_extractor/guard.rb +41 -0
- data/lib/text_extractor/record.rb +20 -8
- data/lib/text_extractor/skip.rb +9 -0
- data/lib/text_extractor/value.rb +2 -2
- data/lib/text_extractor/version.rb +1 -1
- metadata +22 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 6d814b19dedf7312cee41b86de5ac98115240908b9a5ef17d343eb5450d2d608
|
4
|
+
data.tar.gz: d365e26b0ba0a97d5ec25854767248ceff60f91c0808a24b06cf2fb425b87864
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e39cfa57ff12e5df3011e729627c37948b658c8c82447398c1547b94486b0ef46cd0b4f6491e598375010c3882ca23a2fa5288d407cddaebc88d74996712f9e7
|
7
|
+
data.tar.gz: 1a680465ce5f430100b01ec9b2fdc68f51813b280b03d886454f8c0c61d7cafee72a7097e53c2cedcf09d2cb399ff56931719978207e648cf6ef9d2401844022
|
data/lib/text_extractor.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require_relative 'text_extractor/extraction'
|
2
2
|
require_relative 'text_extractor/filldown'
|
3
|
+
require_relative 'text_extractor/guard'
|
3
4
|
require_relative 'text_extractor/record'
|
5
|
+
require_relative 'text_extractor/skip'
|
4
6
|
require_relative 'text_extractor/value'
|
5
7
|
require_relative 'text_extractor/inline_value'
|
6
8
|
|
@@ -8,6 +10,7 @@ require_relative 'text_extractor/inline_value'
|
|
8
10
|
class TextExtractor
|
9
11
|
attr_reader :records, :values
|
10
12
|
|
13
|
+
# rubocop: disable Metrics/MethodLength
|
11
14
|
def initialize(&block)
|
12
15
|
raise "#{self.class}.new requires a block" unless block
|
13
16
|
@values = {}
|
@@ -18,8 +21,11 @@ class TextExtractor
|
|
18
21
|
@current_record_values = []
|
19
22
|
@section_delimiter = nil
|
20
23
|
@section_terminator = nil
|
24
|
+
@append_guards = []
|
21
25
|
instance_exec(&block)
|
26
|
+
@append_guards.each { |g| guard(**g, &g[:block]) }
|
22
27
|
end
|
28
|
+
# rubocop: enable Metrics/MethodLength
|
23
29
|
|
24
30
|
module Patterns
|
25
31
|
INTEGER = /\d+/
|
@@ -28,8 +34,8 @@ class TextExtractor
|
|
28
34
|
IPV4 = /[0-9.]{7,15}/
|
29
35
|
IPV6 = /[:a-fA-F0-9\.]{2,45}/
|
30
36
|
IPADDR = Regexp.union(IPV4, IPV6)
|
31
|
-
IPV4_NET =
|
32
|
-
IPV6_NET =
|
37
|
+
IPV4_NET = %r{#{IPV4}/\d{1,2}}
|
38
|
+
IPV6_NET = %r{#{IPV6}\/\d{1,3}}
|
33
39
|
IPNETADDR = Regexp.union(IPV4_NET, IPV6_NET)
|
34
40
|
TRUE = /y|yes|t|true|on/i
|
35
41
|
FALSE = /n|no|f|false|off/i
|
@@ -75,6 +81,7 @@ class TextExtractor
|
|
75
81
|
def record(klass = Record, **kwargs, &block)
|
76
82
|
raise "#{self.class}.record requires a block" unless block
|
77
83
|
kwargs[:extractor_values] = values
|
84
|
+
kwargs[:factory] ||= @factory if @factory
|
78
85
|
kwargs[:values] = @current_record_values = []
|
79
86
|
@records << klass.new(instance_exec(&block), **kwargs)
|
80
87
|
end
|
@@ -84,6 +91,14 @@ class TextExtractor
|
|
84
91
|
@section_terminator = terminator
|
85
92
|
end
|
86
93
|
|
94
|
+
def factory(object = nil)
|
95
|
+
if object
|
96
|
+
@factory = object
|
97
|
+
else
|
98
|
+
@factory
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
87
102
|
def filldown(**kwargs, &block)
|
88
103
|
raise "#{self.class}.filldown requires a block" unless block
|
89
104
|
record(Filldown, **kwargs, &block)
|
@@ -93,6 +108,16 @@ class TextExtractor
|
|
93
108
|
records[records.length.times.find_index { |i| match["__#{i}"] }]
|
94
109
|
end
|
95
110
|
|
111
|
+
def guard(**kwargs, &block)
|
112
|
+
raise "#{self.class}.guard requires a block" unless block
|
113
|
+
record(Guard, **kwargs, &block)
|
114
|
+
end
|
115
|
+
|
116
|
+
def guards(*guard_args)
|
117
|
+
guard_args = Guards::DEFAULT if guard_args.empty?
|
118
|
+
@append_guards = guard_args
|
119
|
+
end
|
120
|
+
|
96
121
|
def scan(input)
|
97
122
|
prefill = {}
|
98
123
|
sections(input).flat_map { |section|
|
@@ -109,6 +134,11 @@ class TextExtractor
|
|
109
134
|
texts.map { |section| section + @section_terminator }
|
110
135
|
end
|
111
136
|
|
137
|
+
def skip(**kwargs, &block)
|
138
|
+
raise "#{self.class}.skip requires a block" unless block
|
139
|
+
record(Skip, **kwargs, &block)
|
140
|
+
end
|
141
|
+
|
112
142
|
def regexps
|
113
143
|
@records.map.with_index do |record, i|
|
114
144
|
Regexp.new("(?<__#{i}>#{record.source})", record.options)
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require_relative 'record'
|
2
|
+
|
3
|
+
class TextExtractor
|
4
|
+
class Guard < Record
|
5
|
+
def initialize(_regexp, description:, **kwargs)
|
6
|
+
super
|
7
|
+
@description = description
|
8
|
+
@factory ||= :itself.to_proc
|
9
|
+
end
|
10
|
+
|
11
|
+
def extraction(match, _fill)
|
12
|
+
text = @factory.call(match[0])
|
13
|
+
raise GuardError, "#{@description} near #{text.inspect}"
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
INDENTED = {
|
18
|
+
description: 'indented line',
|
19
|
+
block: proc {
|
20
|
+
/
|
21
|
+
^[^\n\S]+[^\n]*$
|
22
|
+
/
|
23
|
+
}
|
24
|
+
}.freeze
|
25
|
+
|
26
|
+
UNINDENTED = {
|
27
|
+
description: 'unindented line',
|
28
|
+
block: proc {
|
29
|
+
/
|
30
|
+
^\S+[^\n]*$
|
31
|
+
/
|
32
|
+
}
|
33
|
+
}.freeze
|
34
|
+
|
35
|
+
DEFAULT = [
|
36
|
+
INDENTED,
|
37
|
+
UNINDENTED
|
38
|
+
].freeze
|
39
|
+
|
40
|
+
class GuardError < StandardError; end
|
41
|
+
end # class TextExtractor
|
@@ -4,8 +4,18 @@ class TextExtractor
|
|
4
4
|
class Record
|
5
5
|
attr_reader :regexp, :factory, :values
|
6
6
|
|
7
|
-
|
8
|
-
|
7
|
+
# rubocop: disable Metrics/ParameterLists
|
8
|
+
def initialize(
|
9
|
+
regexp,
|
10
|
+
factory: nil,
|
11
|
+
values: [],
|
12
|
+
fill: [],
|
13
|
+
directives: true,
|
14
|
+
inline: [],
|
15
|
+
extractor_values: {},
|
16
|
+
strip: nil,
|
17
|
+
**_kwargs
|
18
|
+
)
|
9
19
|
@factory = factory
|
10
20
|
@constructor = FactoryAnalyzer.new(factory).to_proc
|
11
21
|
@extractor_values = extractor_values
|
@@ -15,12 +25,14 @@ class TextExtractor
|
|
15
25
|
@regexp = build_regexp(regexp, directives, strip)
|
16
26
|
@fill = Array(fill)
|
17
27
|
end
|
28
|
+
# rubocop: enable Metrics/ParameterLists
|
18
29
|
|
30
|
+
# @return Array
|
19
31
|
def extraction(match, fill)
|
20
32
|
extracted = {}.merge!(@default_values)
|
21
|
-
.merge!(extract_fills
|
22
|
-
.merge!(extract_values
|
23
|
-
build_extraction(extracted)
|
33
|
+
.merge!(extract_fills(fill))
|
34
|
+
.merge!(extract_values(match))
|
35
|
+
[build_extraction(extracted)]
|
24
36
|
end
|
25
37
|
|
26
38
|
def build_extraction(extracted)
|
@@ -59,9 +71,9 @@ class TextExtractor
|
|
59
71
|
if directives
|
60
72
|
expander = Directives.new(regexp)
|
61
73
|
expanded = expander.expand
|
62
|
-
expander.values.each
|
74
|
+
expander.values.each do |value|
|
63
75
|
values[value.id] = @extractor_values.fetch(value.id, value)
|
64
|
-
|
76
|
+
end
|
65
77
|
expanded
|
66
78
|
else
|
67
79
|
regexp
|
@@ -126,7 +138,7 @@ class TextExtractor
|
|
126
138
|
def to_proc
|
127
139
|
if @params
|
128
140
|
explicit
|
129
|
-
elsif @klass.
|
141
|
+
elsif @klass.respond_to?(:call)
|
130
142
|
@klass
|
131
143
|
elsif @klass
|
132
144
|
implicit
|
data/lib/text_extractor/value.rb
CHANGED
@@ -10,9 +10,9 @@ class TextExtractor
|
|
10
10
|
|
11
11
|
def convert(value)
|
12
12
|
@block ? @block.call(value) : value
|
13
|
-
rescue => e
|
13
|
+
rescue StandardError => e
|
14
14
|
raise e.class,
|
15
|
-
|
15
|
+
'in custom conversion of '\
|
16
16
|
"value(#{id.inspect}, #{re.inspect}): #{e.message}"
|
17
17
|
end
|
18
18
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Miller
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-08-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '1.0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: minitest
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '5.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '5.0'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: rake
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -39,19 +53,19 @@ dependencies:
|
|
39
53
|
- !ruby/object:Gem::Version
|
40
54
|
version: '10.0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
56
|
+
name: rubocop
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
44
58
|
requirements:
|
45
59
|
- - "~>"
|
46
60
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
61
|
+
version: '0.54'
|
48
62
|
type: :development
|
49
63
|
prerelease: false
|
50
64
|
version_requirements: !ruby/object:Gem::Requirement
|
51
65
|
requirements:
|
52
66
|
- - "~>"
|
53
67
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
68
|
+
version: '0.54'
|
55
69
|
description:
|
56
70
|
email: bjmllr@gmail.com
|
57
71
|
executables: []
|
@@ -64,8 +78,10 @@ files:
|
|
64
78
|
- lib/text_extractor/directives/group.rb
|
65
79
|
- lib/text_extractor/extraction.rb
|
66
80
|
- lib/text_extractor/filldown.rb
|
81
|
+
- lib/text_extractor/guard.rb
|
67
82
|
- lib/text_extractor/inline_value.rb
|
68
83
|
- lib/text_extractor/record.rb
|
84
|
+
- lib/text_extractor/skip.rb
|
69
85
|
- lib/text_extractor/value.rb
|
70
86
|
- lib/text_extractor/version.rb
|
71
87
|
homepage: https://github.com/bjmllr/text_extractor
|
@@ -88,9 +104,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
88
104
|
version: '0'
|
89
105
|
requirements: []
|
90
106
|
rubyforge_project:
|
91
|
-
rubygems_version: 2.
|
107
|
+
rubygems_version: 2.7.3
|
92
108
|
signing_key:
|
93
109
|
specification_version: 4
|
94
110
|
summary: Easily extract data from text
|
95
111
|
test_files: []
|
96
|
-
has_rdoc:
|