text_extractor 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/text_extractor.rb +8 -4
- data/lib/text_extractor/directives.rb +131 -0
- data/lib/text_extractor/directives/classes.rb +73 -0
- data/lib/text_extractor/directives/group.rb +46 -0
- data/lib/text_extractor/record.rb +57 -31
- data/lib/text_extractor/version.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 516fd52deaf25b6e67241cd40b55580a43227247
|
4
|
+
data.tar.gz: b776cac3194257f826d8671aef2cf1991a075ae2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 28cf87f08c5c04cc2d11c8576692f15919925360e6b5b31459d2319e4b5a3904a7925f87cc03749947a42cb21f8ce0f21759eb98a57376d7115725e6cba288be
|
7
|
+
data.tar.gz: fa79da2fbd314b46ea0343cf7647c23f843a98ad27a0e04b69a665b6f872ab02384bc7d555b2229aaf3207e082c60522f9dfffebcefc1ab596b8ad3d871604ff
|
data/lib/text_extractor.rb
CHANGED
@@ -68,13 +68,17 @@ class TextExtractor
|
|
68
68
|
def strip_record(regexp, strip: nil)
|
69
69
|
lines = regexp.source.split("\n")
|
70
70
|
prefix = lines.last
|
71
|
-
|
72
|
-
|
71
|
+
|
72
|
+
if prefix =~ /\A\s*\z/
|
73
|
+
lines.pop if lines.first =~ /\A\s*\z/
|
74
|
+
lines.shift
|
75
|
+
strip_record_by_line(lines, prefix, strip)
|
76
|
+
end
|
77
|
+
|
78
|
+
Regexp.new(lines.join("\n"), regexp.options)
|
73
79
|
end
|
74
80
|
|
75
81
|
def strip_record_by_line(lines, prefix, strip)
|
76
|
-
return unless prefix =~ /\A\s*\z/
|
77
|
-
|
78
82
|
lines.map! { |s| s.gsub(prefix.to_s, '') }
|
79
83
|
case strip
|
80
84
|
when :left then lines.map! { |s| "\[ \t\r\f]*#{s.lstrip}" }
|
@@ -0,0 +1,131 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
|
3
|
+
require 'text_extractor/directives/classes'
|
4
|
+
require 'text_extractor/directives/group'
|
5
|
+
|
6
|
+
class TextExtractor
|
7
|
+
# Builds a Directives expander for +re+ and returns the result of its
# #expand call.
def self.expand_directives(re)
  expander = Directives.new(re)
  expander.expand
end
|
10
|
+
|
11
|
+
# Directives can only be named with lowercase ascii letters (a-z) and _
|
12
|
+
# (underscore).
|
13
|
+
#
|
14
|
+
# Directives can take an argument. An argument can contain any sequence of
|
15
|
+
# characters other than newlines, parentheses, or dots (.). The argument
|
16
|
+
# appears after the name, in parentheses, with no whitespace between the name
|
17
|
+
# and left parenthesis. Whitespace inside the parentheses is taken literally
|
18
|
+
# and not ignored.
|
19
|
+
#
|
20
|
+
# When used, each directive name is preceded by a dot (.). There should be no
|
21
|
+
# whitespace on either side of the dot. Some directives can be chained one
|
22
|
+
# after another, still using a dot to separate the earlier directive from the
|
23
|
+
# later one.
|
24
|
+
class Directives
  # original must respond to #source and #options (a Regexp does); both
  # are captured here and the original object is not kept.
  def initialize(original)
    @source = original.source
    @options = original.options
  end

  # Processes the source line by line, applies every directive found and
  # returns a new Regexp built with the original options. The result is
  # memoized, so repeated calls return the same object.
  #
  # Raises RuntimeError when a line group is still open after the last
  # line, or when a directive name cannot be parsed or is unknown.
  def expand
    return @output if @output
    @state = State.new
    scanner = StringScanner.new(@source)
    read_line(scanner) until scanner.eos?
    raise 'Unterminated line group' unless @state.groups.empty?
    # target may hold nested arrays; Array#join flattens them recursively.
    @output = Regexp.new(@state.target.join(''), @options)
  end

  private

  # Maps a directive name to the class implementing it. The optional
  # :arguments rule is either a Proc (called with the text following the
  # name) or any other truthy marker meaning "parse a parenthesised
  # argument" (see #parse_arguments).
  DIRECTIVE_MAP = {
    ' '      => { class: Comment, arguments: ->(source) { [source[1..-1]] } },
    'any'    => { class: Any },
    'begin'  => { class: Begin, arguments: :parse },
    'end'    => { class: End },
    'maybe'  => { class: Maybe },
    'repeat' => { class: Repeat, arguments: :parse }
  }.freeze
  private_constant :DIRECTIVE_MAP

  # Consumes one line (including its trailing newline, if any) from the
  # scanner and feeds it through #add_line.
  def read_line(scanner)
    line = scanner.scan_until(/\n/)

    unless line
      # Last line without a trailing newline: take whatever is left.
      line = scanner.rest
      scanner.skip(/.*/)
    end

    @state.current = @state.current_line = line
    add_line
  end

  # Applies the directives of the current line, then appends what is left
  # of the line to the innermost open group, or to the final target when
  # no group is open. A directive may set current to nil to emit nothing.
  def add_line
    apply_directives read_directives
    return unless @state.current

    if @state.groups.empty?
      @state.target << @state.current
    else
      @state.groups.last << @state.current
    end
  end

  # Looks for a "#." marker at the start of the line or after a space.
  # When found, the text before the marker becomes the current content
  # (keeping the line's newline) and the text after it is parsed into
  # directive objects. Returns an empty array for lines without a marker.
  def read_directives
    md = @state.current_line.match(/(^| )#\./)

    if md
      @state.current = md.pre_match
      @state.current += "\n" if @state.newline?
      parse_directives(md.post_match.rstrip)
    else
      []
    end
  end

  def apply_directives(directives)
    directives.each(&:call)
  end

  # A marker followed by a space ("#. text") is a comment; anything else
  # is a dot-separated chain of directives.
  def parse_directives(full_source)
    return [Comment.new(@state)] if full_source.start_with?(' ')
    split_directives(full_source)
      .map { |source| parse_one_directive(source) }
  end

  # Builds the directive object for a single "name" or "name(argument)"
  # chunk. Raises RuntimeError when the chunk does not start with a valid
  # name or the name is not in DIRECTIVE_MAP.
  def parse_one_directive(source)
    md = source.match(/\A[a-z_]+/)
    # The guard must come before md is dereferenced; otherwise an invalid
    # name surfaces as NoMethodError on nil instead of this message.
    raise "Unknown directive(s) #{source}" unless md
    word = md[0]
    map = DIRECTIVE_MAP.fetch(word) { raise "Unknown directive #{word}" }
    args = parse_arguments(map[:arguments], md.post_match)
    map.fetch(:class).new(@state, *args)
  end

  def split_directives(source)
    source.split('.')
  end

  # Returns the constructor arguments for a directive: an empty list when
  # the directive takes none, the Proc's result for Proc rules, otherwise
  # the text between the first pair of parentheses (nil when absent).
  def parse_arguments(rule, source)
    return [] unless rule
    return rule.call(source) if rule.is_a?(Proc)
    source.match(/\(([^)]*)\)/) { |md| md[1] }
  end
end # class Directives
|
115
|
+
|
116
|
+
# Mutable scratch state shared between the expander and its directives:
# the content to emit for the current line, the raw current line, the
# stack of open groups and the accumulated output.
State = Struct.new(:current, :current_line, :groups, :target) do
  # groups and target default to fresh empty arrays when not supplied.
  def initialize(*)
    super
    self.groups = [] unless groups
    self.target = [] unless target
  end

  # Innermost open group, or nil when none is open.
  def last_group
    groups[-1]
  end

  # True when the raw current line ends with a newline character.
  def newline?
    current_line.end_with?("\n")
  end
end # State
|
131
|
+
end # class TextExtractor
|
@@ -0,0 +1,73 @@
|
|
1
|
+
class TextExtractor
|
2
|
+
class Directives
|
3
|
+
# Base class for line directives. Stores the shared state and the optional
# argument, then gives subclasses a chance to prepare themselves through a
# public #init hook.
class Directive
  attr_reader :state

  def initialize(state, argument = nil)
    @state, @argument = state, argument
    return unless respond_to?(:init)
    init
  end
end
|
13
|
+
|
14
|
+
# Opens a line group. The argument selects the text placed right after the
# opening parenthesis: no argument (or an empty one) yields '?:', the
# argument '?:' yields an empty prefix, and anything else is used verbatim.
class Begin < Directive
  def init
    prefix =
      if @argument.nil? || @argument == ''
        '?:'
      elsif @argument == '?:'
        ''
      else
        @argument
      end
    @group = group(prefix)
  end

  # Factory for the group object; subclasses override it to change the
  # kind of group that gets opened.
  def group(*args)
    Group.new(*args)
  end

  # Emits nothing for the directive's own line and pushes the prepared
  # group on the stack of open groups.
  def call
    state.current = nil
    state.groups.push(@group)
  end
end
|
37
|
+
|
38
|
+
# Opens a group whose lines are alternatives: identical to Begin except
# that the opened group is an AnyGroup.
class Any < Begin
  def group(*group_args)
    AnyGroup.new(*group_args)
  end
end
|
44
|
+
|
45
|
+
# Text that will be omitted from the regexp. Applying the directive does
# nothing and returns nil.
class Comment < Directive
  def call
    nil
  end
end
|
50
|
+
|
51
|
+
# Closes the innermost open line group and makes the finished group the
# content of the current line.
class End < Directive
  # Raises RuntimeError when no group is open; without this guard the
  # failure would be a NoMethodError on nil.
  def call
    group = state.groups.pop
    raise 'Unmatched end directive' unless group
    # finish receives whether the closing line ended with a newline.
    state.current = group.finish(state.newline?)
  end
end
|
57
|
+
|
58
|
+
# Makes the current line or group optional (0 or 1 occurrences) by
# wrapping it in a non-capturing group followed by '?'.
class Maybe < Directive
  def call
    wrapped = state.current
    state.current = ['(?:', wrapped, ')?']
  end
end
|
64
|
+
|
65
|
+
# Repetition: wraps the current line or group in a non-capturing group
# followed by a {...} quantifier taken from the argument. Without an
# argument the quantifier is {0,}.
class Repeat < Directive
  def call
    @argument = '0,' unless @argument
    closing = "){#{@argument}}"
    state.current = ['(?:', state.current, closing]
  end
end
|
72
|
+
end
|
73
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
class TextExtractor
|
2
|
+
class Directives
|
3
|
+
# A line group: collects lines (strings) and already-finished sub-groups
# (nested arrays of the form [opener, *content, closer]) and renders them
# wrapped in parentheses.
class Group
  # type is the text placed right after the opening parenthesis (for
  # example '?:'); any further arguments are the initial lines.
  def initialize(type, *args)
    @type = type
    @lines = args
  end

  def <<(item)
    @lines << item
  end

  def to_a
    @lines
  end

  # Removes the trailing newline from the last line of the group unless
  # +newline+ is true. When the last item is a wrapped sub-group, this
  # descends into it until it reaches the last string of content.
  def chomp(newline)
    return if @lines.empty? || newline

    container = @lines
    index = -1
    while container[index].is_a?(Array)
      container = container[index]
      # A wrapped group ends with its closer, so its last piece of content
      # sits at -2; only follow -1 when the last element is itself nested.
      index = container[-1].is_a?(Array) ? -1 : -2
    end

    item = container[index]
    # Guard against non-string content (e.g. nil) instead of failing.
    container[index] = item.chomp if item.is_a?(String)
  end

  # Chomps the group as requested and returns its rendered form.
  def finish(newline)
    chomp(newline)
    join
  end

  def join
    ["(#{@type}", *@lines, ')']
  end
end
|
38
|
+
|
39
|
+
# A line group where each line (or subgroup) is an alternative: the items
# are rendered separated by '|' inside a non-capturing group.
class AnyGroup < Group
  def join
    alternatives = []
    @lines.each_with_index do |entry, position|
      alternatives << '|' unless position.zero?
      alternatives << entry
    end
    ['(?:', *alternatives, ')']
  end
end
|
45
|
+
end
|
46
|
+
end
|
@@ -5,6 +5,7 @@ class TextExtractor
|
|
5
5
|
def initialize(regexp, factory: nil, values: [], fill: [])
|
6
6
|
@regexp = regexp
|
7
7
|
@factory = factory
|
8
|
+
@constructor = FactoryAnalyzer.new(factory).to_proc
|
8
9
|
@values = values.map { |val| [val.id, val] }.to_h
|
9
10
|
@default_values = values.map { |val| [val.id, nil] }.to_h
|
10
11
|
@fill = Array(fill)
|
@@ -18,37 +19,8 @@ class TextExtractor
|
|
18
19
|
end
|
19
20
|
|
20
21
|
def build_extraction(extracted)
|
21
|
-
|
22
|
-
|
23
|
-
build_extraction_by_hash(extracted)
|
24
|
-
when Set
|
25
|
-
build_extraction_by_set(extracted)
|
26
|
-
when Class
|
27
|
-
build_extraction_by_class(extracted)
|
28
|
-
else
|
29
|
-
extracted
|
30
|
-
end
|
31
|
-
end
|
32
|
-
|
33
|
-
def build_extraction_by_hash(extracted)
|
34
|
-
klass, params = factory.first
|
35
|
-
klass.new(*extracted.values_at(*params))
|
36
|
-
end
|
37
|
-
|
38
|
-
def build_extraction_by_set(extracted)
|
39
|
-
klass, params = factory.first
|
40
|
-
values = params.each_with_object({}) do |param, hash|
|
41
|
-
hash[param] = extracted[param]
|
42
|
-
end
|
43
|
-
klass.new(**values)
|
44
|
-
end
|
45
|
-
|
46
|
-
def build_extraction_by_class(extracted)
|
47
|
-
if factory.ancestors.include?(Struct)
|
48
|
-
factory.new(*extracted.values)
|
49
|
-
else
|
50
|
-
factory.new(**extracted)
|
51
|
-
end
|
22
|
+
return extracted unless @constructor
|
23
|
+
@constructor.call(extracted)
|
52
24
|
end
|
53
25
|
|
54
26
|
def match(string, pos = 0)
|
@@ -70,5 +42,59 @@ class TextExtractor
|
|
70
42
|
# Returns a Hash mapping every declared value id to the result of that
# value's #convert applied to match[id].
def extract_values(match)
  values.each_with_object({}) do |(id, value), converted|
    converted[id] = value.convert(match[id])
  end
end
|
45
|
+
|
46
|
+
# Converts the value of the factory option to a constructor proc.
#
# Accepted shapes, as handled below:
# * { Klass => Array } - positional arguments taken from the listed keys
# * { Klass => Set }   - keyword arguments restricted to the listed keys
# * a Proc             - used as the constructor as-is
# * any other truthy object responding to #ancestors and #new - Struct
#   descendants get the extracted values positionally, everything else
#   gets them as keyword arguments
class FactoryAnalyzer
  def initialize(factory)
    if factory.is_a?(Hash)
      @klass, @params = factory.first
    else
      @klass = factory
    end
  end

  # Returns the constructor proc, or nil when no factory applies.
  def to_proc
    return explicit if @params
    return @klass if @klass.is_a?(Proc)
    implicit if @klass
  end

  private

  # Constructor for an explicit parameter list; nil when the list is
  # neither an Array nor a Set.
  def explicit
    if @params.is_a?(Array)
      positional
    elsif @params.is_a?(Set)
      keyword
    end
  end

  def positional
    lambda do |extracted|
      arguments = extracted.values_at(*@params)
      @klass.new(*arguments)
    end
  end

  def keyword
    lambda do |extracted|
      selected = @params.map { |param| [param, extracted[param]] }.to_h
      @klass.new(**selected)
    end
  end

  # Constructor that passes everything that was extracted.
  def implicit
    struct_like = @klass.ancestors.include?(Struct)
    return ->(extracted) { @klass.new(*extracted.values) } if struct_like
    ->(extracted) { @klass.new(**extracted) }
  end
end # class FactoryAnalyzer
|
73
99
|
end # class Record
|
74
100
|
end # class TextExtractor
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Miller
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -59,6 +59,9 @@ extensions: []
|
|
59
59
|
extra_rdoc_files: []
|
60
60
|
files:
|
61
61
|
- lib/text_extractor.rb
|
62
|
+
- lib/text_extractor/directives.rb
|
63
|
+
- lib/text_extractor/directives/classes.rb
|
64
|
+
- lib/text_extractor/directives/group.rb
|
62
65
|
- lib/text_extractor/extraction.rb
|
63
66
|
- lib/text_extractor/filldown.rb
|
64
67
|
- lib/text_extractor/record.rb
|