text_extractor 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/lib/text_extractor.rb +32 -2
- data/lib/text_extractor/directives/classes.rb +1 -2
- data/lib/text_extractor/guard.rb +41 -0
- data/lib/text_extractor/record.rb +20 -8
- data/lib/text_extractor/skip.rb +9 -0
- data/lib/text_extractor/value.rb +2 -2
- data/lib/text_extractor/version.rb +1 -1
- metadata +22 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 6d814b19dedf7312cee41b86de5ac98115240908b9a5ef17d343eb5450d2d608
|
4
|
+
data.tar.gz: d365e26b0ba0a97d5ec25854767248ceff60f91c0808a24b06cf2fb425b87864
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e39cfa57ff12e5df3011e729627c37948b658c8c82447398c1547b94486b0ef46cd0b4f6491e598375010c3882ca23a2fa5288d407cddaebc88d74996712f9e7
|
7
|
+
data.tar.gz: 1a680465ce5f430100b01ec9b2fdc68f51813b280b03d886454f8c0c61d7cafee72a7097e53c2cedcf09d2cb399ff56931719978207e648cf6ef9d2401844022
|
data/lib/text_extractor.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require_relative 'text_extractor/extraction'
|
2
2
|
require_relative 'text_extractor/filldown'
|
3
|
+
require_relative 'text_extractor/guard'
|
3
4
|
require_relative 'text_extractor/record'
|
5
|
+
require_relative 'text_extractor/skip'
|
4
6
|
require_relative 'text_extractor/value'
|
5
7
|
require_relative 'text_extractor/inline_value'
|
6
8
|
|
@@ -8,6 +10,7 @@ require_relative 'text_extractor/inline_value'
|
|
8
10
|
class TextExtractor
|
9
11
|
attr_reader :records, :values
|
10
12
|
|
13
|
+
# rubocop: disable Metrics/MethodLength
|
11
14
|
def initialize(&block)
|
12
15
|
raise "#{self.class}.new requires a block" unless block
|
13
16
|
@values = {}
|
@@ -18,8 +21,11 @@ class TextExtractor
|
|
18
21
|
@current_record_values = []
|
19
22
|
@section_delimiter = nil
|
20
23
|
@section_terminator = nil
|
24
|
+
@append_guards = []
|
21
25
|
instance_exec(&block)
|
26
|
+
@append_guards.each { |g| guard(**g, &g[:block]) }
|
22
27
|
end
|
28
|
+
# rubocop: enable Metrics/MethodLength
|
23
29
|
|
24
30
|
module Patterns
|
25
31
|
INTEGER = /\d+/
|
@@ -28,8 +34,8 @@ class TextExtractor
|
|
28
34
|
IPV4 = /[0-9.]{7,15}/
|
29
35
|
IPV6 = /[:a-fA-F0-9\.]{2,45}/
|
30
36
|
IPADDR = Regexp.union(IPV4, IPV6)
|
31
|
-
IPV4_NET =
|
32
|
-
IPV6_NET =
|
37
|
+
IPV4_NET = %r{#{IPV4}/\d{1,2}}
|
38
|
+
IPV6_NET = %r{#{IPV6}\/\d{1,3}}
|
33
39
|
IPNETADDR = Regexp.union(IPV4_NET, IPV6_NET)
|
34
40
|
TRUE = /y|yes|t|true|on/i
|
35
41
|
FALSE = /n|no|f|false|off/i
|
@@ -75,6 +81,7 @@ class TextExtractor
|
|
75
81
|
def record(klass = Record, **kwargs, &block)
|
76
82
|
raise "#{self.class}.record requires a block" unless block
|
77
83
|
kwargs[:extractor_values] = values
|
84
|
+
kwargs[:factory] ||= @factory if @factory
|
78
85
|
kwargs[:values] = @current_record_values = []
|
79
86
|
@records << klass.new(instance_exec(&block), **kwargs)
|
80
87
|
end
|
@@ -84,6 +91,14 @@ class TextExtractor
|
|
84
91
|
@section_terminator = terminator
|
85
92
|
end
|
86
93
|
|
94
|
+
def factory(object = nil)
|
95
|
+
if object
|
96
|
+
@factory = object
|
97
|
+
else
|
98
|
+
@factory
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
87
102
|
def filldown(**kwargs, &block)
|
88
103
|
raise "#{self.class}.filldown requires a block" unless block
|
89
104
|
record(Filldown, **kwargs, &block)
|
@@ -93,6 +108,16 @@ class TextExtractor
|
|
93
108
|
records[records.length.times.find_index { |i| match["__#{i}"] }]
|
94
109
|
end
|
95
110
|
|
111
|
+
def guard(**kwargs, &block)
|
112
|
+
raise "#{self.class}.guard requires a block" unless block
|
113
|
+
record(Guard, **kwargs, &block)
|
114
|
+
end
|
115
|
+
|
116
|
+
def guards(*guard_args)
|
117
|
+
guard_args = Guards::DEFAULT if guard_args.empty?
|
118
|
+
@append_guards = guard_args
|
119
|
+
end
|
120
|
+
|
96
121
|
def scan(input)
|
97
122
|
prefill = {}
|
98
123
|
sections(input).flat_map { |section|
|
@@ -109,6 +134,11 @@ class TextExtractor
|
|
109
134
|
texts.map { |section| section + @section_terminator }
|
110
135
|
end
|
111
136
|
|
137
|
+
def skip(**kwargs, &block)
|
138
|
+
raise "#{self.class}.skip requires a block" unless block
|
139
|
+
record(Skip, **kwargs, &block)
|
140
|
+
end
|
141
|
+
|
112
142
|
def regexps
|
113
143
|
@records.map.with_index do |record, i|
|
114
144
|
Regexp.new("(?<__#{i}>#{record.source})", record.options)
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require_relative 'record'
|
2
|
+
|
3
|
+
class TextExtractor
|
4
|
+
class Guard < Record
|
5
|
+
def initialize(_regexp, description:, **kwargs)
|
6
|
+
super
|
7
|
+
@description = description
|
8
|
+
@factory ||= :itself.to_proc
|
9
|
+
end
|
10
|
+
|
11
|
+
def extraction(match, _fill)
|
12
|
+
text = @factory.call(match[0])
|
13
|
+
raise GuardError, "#{@description} near #{text.inspect}"
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
INDENTED = {
|
18
|
+
description: 'indented line',
|
19
|
+
block: proc {
|
20
|
+
/
|
21
|
+
^[^\n\S]+[^\n]*$
|
22
|
+
/
|
23
|
+
}
|
24
|
+
}.freeze
|
25
|
+
|
26
|
+
UNINDENTED = {
|
27
|
+
description: 'unindented line',
|
28
|
+
block: proc {
|
29
|
+
/
|
30
|
+
^\S+[^\n]*$
|
31
|
+
/
|
32
|
+
}
|
33
|
+
}.freeze
|
34
|
+
|
35
|
+
DEFAULT = [
|
36
|
+
INDENTED,
|
37
|
+
UNINDENTED
|
38
|
+
].freeze
|
39
|
+
|
40
|
+
class GuardError < StandardError; end
|
41
|
+
end # class TextExtractor
|
@@ -4,8 +4,18 @@ class TextExtractor
|
|
4
4
|
class Record
|
5
5
|
attr_reader :regexp, :factory, :values
|
6
6
|
|
7
|
-
|
8
|
-
|
7
|
+
# rubocop: disable Metrics/ParameterLists
|
8
|
+
def initialize(
|
9
|
+
regexp,
|
10
|
+
factory: nil,
|
11
|
+
values: [],
|
12
|
+
fill: [],
|
13
|
+
directives: true,
|
14
|
+
inline: [],
|
15
|
+
extractor_values: {},
|
16
|
+
strip: nil,
|
17
|
+
**_kwargs
|
18
|
+
)
|
9
19
|
@factory = factory
|
10
20
|
@constructor = FactoryAnalyzer.new(factory).to_proc
|
11
21
|
@extractor_values = extractor_values
|
@@ -15,12 +25,14 @@ class TextExtractor
|
|
15
25
|
@regexp = build_regexp(regexp, directives, strip)
|
16
26
|
@fill = Array(fill)
|
17
27
|
end
|
28
|
+
# rubocop: enable Metrics/ParameterLists
|
18
29
|
|
30
|
+
# @return Array
|
19
31
|
def extraction(match, fill)
|
20
32
|
extracted = {}.merge!(@default_values)
|
21
|
-
.merge!(extract_fills
|
22
|
-
.merge!(extract_values
|
23
|
-
build_extraction(extracted)
|
33
|
+
.merge!(extract_fills(fill))
|
34
|
+
.merge!(extract_values(match))
|
35
|
+
[build_extraction(extracted)]
|
24
36
|
end
|
25
37
|
|
26
38
|
def build_extraction(extracted)
|
@@ -59,9 +71,9 @@ class TextExtractor
|
|
59
71
|
if directives
|
60
72
|
expander = Directives.new(regexp)
|
61
73
|
expanded = expander.expand
|
62
|
-
expander.values.each
|
74
|
+
expander.values.each do |value|
|
63
75
|
values[value.id] = @extractor_values.fetch(value.id, value)
|
64
|
-
|
76
|
+
end
|
65
77
|
expanded
|
66
78
|
else
|
67
79
|
regexp
|
@@ -126,7 +138,7 @@ class TextExtractor
|
|
126
138
|
def to_proc
|
127
139
|
if @params
|
128
140
|
explicit
|
129
|
-
elsif @klass.
|
141
|
+
elsif @klass.respond_to?(:call)
|
130
142
|
@klass
|
131
143
|
elsif @klass
|
132
144
|
implicit
|
data/lib/text_extractor/value.rb
CHANGED
@@ -10,9 +10,9 @@ class TextExtractor
|
|
10
10
|
|
11
11
|
def convert(value)
|
12
12
|
@block ? @block.call(value) : value
|
13
|
-
rescue => e
|
13
|
+
rescue StandardError => e
|
14
14
|
raise e.class,
|
15
|
-
|
15
|
+
'in custom conversion of '\
|
16
16
|
"value(#{id.inspect}, #{re.inspect}): #{e.message}"
|
17
17
|
end
|
18
18
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Miller
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-08-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '1.0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: minitest
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '5.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '5.0'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: rake
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -39,19 +53,19 @@ dependencies:
|
|
39
53
|
- !ruby/object:Gem::Version
|
40
54
|
version: '10.0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
56
|
+
name: rubocop
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
44
58
|
requirements:
|
45
59
|
- - "~>"
|
46
60
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
61
|
+
version: '0.54'
|
48
62
|
type: :development
|
49
63
|
prerelease: false
|
50
64
|
version_requirements: !ruby/object:Gem::Requirement
|
51
65
|
requirements:
|
52
66
|
- - "~>"
|
53
67
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
68
|
+
version: '0.54'
|
55
69
|
description:
|
56
70
|
email: bjmllr@gmail.com
|
57
71
|
executables: []
|
@@ -64,8 +78,10 @@ files:
|
|
64
78
|
- lib/text_extractor/directives/group.rb
|
65
79
|
- lib/text_extractor/extraction.rb
|
66
80
|
- lib/text_extractor/filldown.rb
|
81
|
+
- lib/text_extractor/guard.rb
|
67
82
|
- lib/text_extractor/inline_value.rb
|
68
83
|
- lib/text_extractor/record.rb
|
84
|
+
- lib/text_extractor/skip.rb
|
69
85
|
- lib/text_extractor/value.rb
|
70
86
|
- lib/text_extractor/version.rb
|
71
87
|
homepage: https://github.com/bjmllr/text_extractor
|
@@ -88,9 +104,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
88
104
|
version: '0'
|
89
105
|
requirements: []
|
90
106
|
rubyforge_project:
|
91
|
-
rubygems_version: 2.
|
107
|
+
rubygems_version: 2.7.3
|
92
108
|
signing_key:
|
93
109
|
specification_version: 4
|
94
110
|
summary: Easily extract data from text
|
95
111
|
test_files: []
|
96
|
-
has_rdoc:
|