text_extractor 0.2.0 → 0.3.0
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/text_extractor.rb +6 -0
- data/lib/text_extractor/directives.rb +8 -0
- data/lib/text_extractor/directives/classes.rb +17 -0
- data/lib/text_extractor/directives/group.rb +12 -0
- data/lib/text_extractor/inline_value.rb +13 -0
- data/lib/text_extractor/record.rb +19 -3
- data/lib/text_extractor/value.rb +1 -1
- data/lib/text_extractor/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c42725b840531e241a7353991f7fec182561a89c
|
4
|
+
data.tar.gz: 07b7639b33d4d7b9380e9674c4a3f6531c3b03ff
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9cadfc1dee9915d1b0b259de9b006154486aaba50808246bff28af16630ad6f481e8fba20289d1f60b315f19d98df44320df2af5e33b58096faaa6596163fa54
|
7
|
+
data.tar.gz: 9c3ee7b8a460612908a24bc64cc765aea243da2d2e9462b16630db1402fd6b7f75e2877be55341b968ac708d1b9b06c04b6424f50ecc5b5bafce373c7f5a049d
|
data/lib/text_extractor.rb
CHANGED
@@ -2,6 +2,7 @@ require_relative 'text_extractor/extraction'
|
|
2
2
|
require_relative 'text_extractor/filldown'
|
3
3
|
require_relative 'text_extractor/record'
|
4
4
|
require_relative 'text_extractor/value'
|
5
|
+
require_relative 'text_extractor/inline_value'
|
5
6
|
|
6
7
|
# represents an extractor definition
|
7
8
|
class TextExtractor
|
@@ -41,6 +42,10 @@ class TextExtractor
|
|
41
42
|
end
|
42
43
|
end
|
43
44
|
|
45
|
+
def inline(id, &block)
|
46
|
+
@values[id] = InlineValue.new(id, &block)
|
47
|
+
end
|
48
|
+
|
44
49
|
def boolean(id, re = Patterns::BOOLEAN)
|
45
50
|
value(id, re) { |val| !val.match(Patterns::FALSE) }
|
46
51
|
end
|
@@ -67,6 +72,7 @@ class TextExtractor
|
|
67
72
|
|
68
73
|
def record(klass = Record, **kwargs, &block)
|
69
74
|
raise "#{self.class}.record requires a block" unless block
|
75
|
+
kwargs[:extractor_values] = values
|
70
76
|
kwargs[:values] = @current_record_values = []
|
71
77
|
@records << klass.new(instance_exec(&block), **kwargs)
|
72
78
|
end
|
data/lib/text_extractor/directives.rb
CHANGED
@@ -25,6 +25,8 @@ class TextExtractor
|
|
25
25
|
def initialize(original)
|
26
26
|
@source = original.source
|
27
27
|
@options = original.options
|
28
|
+
@output = nil
|
29
|
+
@directives = []
|
28
30
|
end
|
29
31
|
|
30
32
|
def expand
|
@@ -36,12 +38,17 @@ class TextExtractor
|
|
36
38
|
@output = Regexp.new(@state.target.join(''), @options)
|
37
39
|
end
|
38
40
|
|
41
|
+
def values
|
42
|
+
@directives.flat_map(&:values)
|
43
|
+
end
|
44
|
+
|
39
45
|
private
|
40
46
|
|
41
47
|
DIRECTIVE_MAP = {
|
42
48
|
' ' => { class: Comment },
|
43
49
|
'any' => { class: Any },
|
44
50
|
'begin' => { class: Begin, arguments: :parsed },
|
51
|
+
'capture' => { class: Capture, arguments: :parsed },
|
45
52
|
'end' => { class: End },
|
46
53
|
'maybe' => { class: Maybe },
|
47
54
|
'repeat' => { class: Repeat, arguments: :parse },
|
@@ -92,6 +99,7 @@ class TextExtractor
|
|
92
99
|
return [Comment.new(@state)] if full_source.start_with?(' ')
|
93
100
|
split_directives(full_source)
|
94
101
|
.map { |source| parse_one_directive(source) }
|
102
|
+
.each { |directive| @directives << directive }
|
95
103
|
end
|
96
104
|
|
97
105
|
def parse_one_directive(source)
|
data/lib/text_extractor/directives/classes.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'text_extractor/inline_value'
|
2
|
+
|
1
3
|
class TextExtractor
|
2
4
|
class Directives
|
3
5
|
# base class for line directives
|
@@ -9,6 +11,10 @@ class TextExtractor
|
|
9
11
|
@argument = argument
|
10
12
|
init if respond_to?(:init)
|
11
13
|
end
|
14
|
+
|
15
|
+
def values
|
16
|
+
[]
|
17
|
+
end
|
12
18
|
end
|
13
19
|
|
14
20
|
# open a line group
|
@@ -42,6 +48,17 @@ class TextExtractor
|
|
42
48
|
end
|
43
49
|
end
|
44
50
|
|
51
|
+
# capture group that creates a value
|
52
|
+
class Capture < Begin
|
53
|
+
def group(name, *args)
|
54
|
+
CaptureGroup.new(name, *args)
|
55
|
+
end
|
56
|
+
|
57
|
+
def values
|
58
|
+
[InlineValue.new(@argument.to_sym)]
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
45
62
|
# text that will be omitted from the regexp
|
46
63
|
class Comment < Directive
|
47
64
|
def call
|
data/lib/text_extractor/directives/group.rb
CHANGED
@@ -42,5 +42,17 @@ class TextExtractor
|
|
42
42
|
['(?:', *@lines.flat_map { |e| [e, '|'] }[0..-2], ')']
|
43
43
|
end
|
44
44
|
end
|
45
|
+
|
46
|
+
# a line group that will be captured to a value
|
47
|
+
class CaptureGroup < Group
|
48
|
+
def initialize(name, *args)
|
49
|
+
@name = name
|
50
|
+
@lines = args
|
51
|
+
end
|
52
|
+
|
53
|
+
def join
|
54
|
+
["(?<#{@name}>", *@lines, ')']
|
55
|
+
end
|
56
|
+
end
|
45
57
|
end
|
46
58
|
end
|
data/lib/text_extractor/record.rb
CHANGED
@@ -5,12 +5,14 @@ class TextExtractor
|
|
5
5
|
attr_reader :regexp, :factory, :values
|
6
6
|
|
7
7
|
def initialize(regexp, factory: nil, values: [], fill: [], directives: true,
|
8
|
-
strip: nil)
|
9
|
-
@regexp = build_regexp(regexp, directives, strip)
|
8
|
+
inline: [], extractor_values: {}, strip: nil)
|
10
9
|
@factory = factory
|
11
10
|
@constructor = FactoryAnalyzer.new(factory).to_proc
|
11
|
+
@extractor_values = extractor_values
|
12
12
|
@values = values.map { |val| [val.id, val] }.to_h
|
13
|
+
initialize_inline_values(inline)
|
13
14
|
@default_values = values.map { |val| [val.id, nil] }.to_h
|
15
|
+
@regexp = build_regexp(regexp, directives, strip)
|
14
16
|
@fill = Array(fill)
|
15
17
|
end
|
16
18
|
|
@@ -55,7 +57,12 @@ class TextExtractor
|
|
55
57
|
|
56
58
|
def expand_regexp(regexp, directives)
|
57
59
|
if directives
|
58
|
-
|
60
|
+
expander = Directives.new(regexp)
|
61
|
+
expanded = expander.expand
|
62
|
+
expander.values.each { |value|
|
63
|
+
values[value.id] = @extractor_values.fetch(value.id, value)
|
64
|
+
}
|
65
|
+
expanded
|
59
66
|
else
|
60
67
|
regexp
|
61
68
|
end
|
@@ -96,9 +103,18 @@ class TextExtractor
|
|
96
103
|
values.keys.map { |id| [id, values[id].convert(match[id])] }.to_h
|
97
104
|
end
|
98
105
|
|
106
|
+
def initialize_inline_values(inline_values)
|
107
|
+
inline_values.each do |value|
|
108
|
+
@values[value] = @extractor_values
|
109
|
+
.fetch(value) { InlineValue.new(value) }
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
99
113
|
# converts the value of the factory option to a constructor proc
|
100
114
|
class FactoryAnalyzer
|
101
115
|
def initialize(factory)
|
116
|
+
@params = nil
|
117
|
+
|
102
118
|
case factory
|
103
119
|
when Hash
|
104
120
|
@klass, @params = factory.first
|
data/lib/text_extractor/value.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Miller
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-08-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -64,6 +64,7 @@ files:
|
|
64
64
|
- lib/text_extractor/directives/group.rb
|
65
65
|
- lib/text_extractor/extraction.rb
|
66
66
|
- lib/text_extractor/filldown.rb
|
67
|
+
- lib/text_extractor/inline_value.rb
|
67
68
|
- lib/text_extractor/record.rb
|
68
69
|
- lib/text_extractor/value.rb
|
69
70
|
- lib/text_extractor/version.rb
|
@@ -92,4 +93,3 @@ signing_key:
|
|
92
93
|
specification_version: 4
|
93
94
|
summary: Easily extract data from text
|
94
95
|
test_files: []
|
95
|
-
has_rdoc:
|