text_extractor 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/text_extractor.rb +8 -4
- data/lib/text_extractor/directives.rb +131 -0
- data/lib/text_extractor/directives/classes.rb +73 -0
- data/lib/text_extractor/directives/group.rb +46 -0
- data/lib/text_extractor/record.rb +57 -31
- data/lib/text_extractor/version.rb +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 516fd52deaf25b6e67241cd40b55580a43227247
|
4
|
+
data.tar.gz: b776cac3194257f826d8671aef2cf1991a075ae2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 28cf87f08c5c04cc2d11c8576692f15919925360e6b5b31459d2319e4b5a3904a7925f87cc03749947a42cb21f8ce0f21759eb98a57376d7115725e6cba288be
|
7
|
+
data.tar.gz: fa79da2fbd314b46ea0343cf7647c23f843a98ad27a0e04b69a665b6f872ab02384bc7d555b2229aaf3207e082c60522f9dfffebcefc1ab596b8ad3d871604ff
|
data/lib/text_extractor.rb
CHANGED
@@ -68,13 +68,17 @@ class TextExtractor
|
|
68
68
|
def strip_record(regexp, strip: nil)
|
69
69
|
lines = regexp.source.split("\n")
|
70
70
|
prefix = lines.last
|
71
|
-
|
72
|
-
|
71
|
+
|
72
|
+
if prefix =~ /\A\s*\z/
|
73
|
+
lines.pop if lines.first =~ /\A\s*\z/
|
74
|
+
lines.shift
|
75
|
+
strip_record_by_line(lines, prefix, strip)
|
76
|
+
end
|
77
|
+
|
78
|
+
Regexp.new(lines.join("\n"), regexp.options)
|
73
79
|
end
|
74
80
|
|
75
81
|
def strip_record_by_line(lines, prefix, strip)
|
76
|
-
return unless prefix =~ /\A\s*\z/
|
77
|
-
|
78
82
|
lines.map! { |s| s.gsub(prefix.to_s, '') }
|
79
83
|
case strip
|
80
84
|
when :left then lines.map! { |s| "\[ \t\r\f]*#{s.lstrip}" }
|
@@ -0,0 +1,131 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
|
3
|
+
require 'text_extractor/directives/classes'
|
4
|
+
require 'text_extractor/directives/group'
|
5
|
+
|
6
|
+
class TextExtractor
|
7
|
+
# Expands the line directives embedded in +re+ by building a Directives
# object for it and returning the result of its #expand.
def self.expand_directives(re)
  directives = Directives.new(re)
  directives.expand
end
|
10
|
+
|
11
|
+
# Directives can only be named with lowercase ascii letters (a-z) and _
|
12
|
+
# (underscore).
|
13
|
+
#
|
14
|
+
# Directives can take an argument. An argument can contain any sequence of
|
15
|
+
# characters other than newlines, parentheses, or dot (.). The argument
|
16
|
+
# appears after the name, in parentheses, with no whitespace between the name
|
17
|
+
# and left parenthesis. Whitespace inside the parentheses is taken literally
|
18
|
+
# and not ignored.
|
19
|
+
#
|
20
|
+
# When used, each directive name is preceded by a dot (.). There should be no
|
21
|
+
# whitespace on either side of the dot. Some directives can be chained one
|
22
|
+
# after another, still using a dot to separate the earlier directive from the
|
23
|
+
# later one.
|
24
|
+
class Directives
|
25
|
+
# Remembers the pattern text and the option flags of +original+ (anything
# responding to #source and #options, e.g. a Regexp) for later expansion.
def initialize(original)
  @source, @options = original.source, original.options
end
|
29
|
+
|
30
|
+
# Builds (once) and returns the Regexp obtained by feeding the source through
# read_line one line at a time. The result is memoized in @output.
#
# Raises RuntimeError ('Unterminated line group') when groups are still open
# after the last line has been read.
def expand
  @output ||= begin
    @state = State.new
    scanner = StringScanner.new(@source)
    until scanner.eos?
      read_line(scanner)
    end
    raise 'Unterminated line group' unless @state.groups.empty?

    # target may hold nested arrays; Array#join flattens them while joining
    Regexp.new(@state.target.join(''), @options)
  end
end
|
38
|
+
|
39
|
+
private
|
40
|
+
|
41
|
+
# Maps a directive name to the class implementing it and, optionally, the rule
# used to obtain that class's constructor argument(s) (see parse_arguments):
#   * no :arguments key - the directive takes no argument;
#   * a Proc            - called with the text following the directive name;
#   * any other value   - the text between the first pair of parentheses.
# The non-Proc rule is spelled :parse everywhere; parse_arguments only checks
# it for truthiness, so the spelling carries no meaning beyond consistency.
#
# NOTE(review): the ' ' entry looks unreachable through parse_one_directive,
# whose name pattern only accepts a-z and underscore, and parse_directives
# builds Comment itself. Kept as-is - confirm before removing.
DIRECTIVE_MAP = {
  ' ' => { class: Comment, arguments: ->(source) { [source[1..-1]] } },
  'any' => { class: Any },
  'begin' => { class: Begin, arguments: :parse },
  'end' => { class: End },
  'maybe' => { class: Maybe },
  'repeat' => { class: Repeat, arguments: :parse }
}.freeze
private_constant :DIRECTIVE_MAP
|
50
|
+
|
51
|
+
# Consumes the next line (including its trailing newline, if any) from
# +scanner+, records it as both the current text and the current raw line on
# @state, and hands over to add_line.
def read_line(scanner)
  # No newline left: take whatever remains and advance past it.
  line = scanner.scan_until(/\n/) || scanner.rest.tap { scanner.skip(/.*/) }

  @state.current_line = line
  @state.current = line
  add_line
end
|
62
|
+
|
63
|
+
# Applies the directives found on the current line, then appends the
# resulting text to the innermost open group, or to the top-level target when
# no group is open. Nothing is appended when a directive cleared the text.
def add_line
  apply_directives(read_directives)

  text = @state.current
  return unless text

  sink = @state.groups.empty? ? @state.target : @state.groups.last
  sink << text
end
|
73
|
+
|
74
|
+
# Looks for a directive marker ("#." at the start of the line or after a
# space) in the current raw line. Without a marker, returns an empty list and
# leaves the state untouched. With one, the text before the marker becomes the
# current text (the newline is re-added when the raw line had one) and the
# text after the marker is handed to parse_directives.
def read_directives
  marker = @state.current_line.match(/(^| )#\./)
  return [] unless marker

  @state.current = marker.pre_match
  @state.current += "\n" if @state.newline?
  parse_directives(marker.post_match.rstrip)
end
|
85
|
+
|
86
|
+
# Invokes #call on every directive, in order; returns the list it was given.
def apply_directives(directives)
  directives.each { |directive| directive.call }
end
|
89
|
+
|
90
|
+
# Turns the text that follows the "#." marker into directive objects. Text
# beginning with a space is a comment and yields a single Comment; anything
# else is split into individual directives which are parsed one by one.
def parse_directives(full_source)
  if full_source.start_with?(' ')
    [Comment.new(@state)]
  else
    pieces = split_directives(full_source)
    pieces.map { |piece| parse_one_directive(piece) }
  end
end
|
95
|
+
|
96
|
+
# Builds the directive object described by +source+ (a single directive such
# as "maybe" or "repeat(2,3)").
#
# The leading run of a-z/underscore characters is the directive name; the
# remainder is passed to parse_arguments together with the rule registered
# for that name in DIRECTIVE_MAP.
#
# Raises RuntimeError when +source+ does not start with a directive name or
# when the name is not registered.
def parse_one_directive(source)
  md = source.match(/^[a-z_]+/)
  # Check for a match before touching it: indexing a nil match would raise
  # NoMethodError and hide the intended error message.
  raise "Unknown directive(s) #{source}" unless md

  word = md[0]
  map = DIRECTIVE_MAP.fetch(word) { raise "Unknown directive #{word}" }
  args = parse_arguments(map[:arguments], md.post_match)
  map.fetch(:class).new(@state, *args)
end
|
104
|
+
|
105
|
+
# Breaks a chain of directives into its parts; a dot separates one directive
# from the next.
def split_directives(source)
  separator = '.'
  source.split(separator)
end
|
108
|
+
|
109
|
+
# Extracts a directive's argument(s) from +source+, the text following the
# directive name, according to +rule+:
#   * nil/false - no arguments, returns [];
#   * a Proc    - returns whatever the proc returns for +source+;
#   * otherwise - returns the text between the first "(" and the next ")",
#                 or nil when there are no parentheses.
def parse_arguments(rule, source)
  case rule
  when nil, false
    []
  when Proc
    rule.call(source)
  else
    found = source.match(/\(([^)]*)\)/)
    found && found[1]
  end
end
|
114
|
+
end # class Directives
|
115
|
+
|
116
|
+
# Mutable bookkeeping shared while directives are expanded:
#   current      - text that will be emitted for the line being processed
#   current_line - the raw line as read from the source
#   groups       - stack of currently open line groups
#   target       - pieces of the expanded pattern collected so far
State = Struct.new(:current, :current_line, :groups, :target) do
  # Accepts the usual Struct arguments; groups and target default to fresh
  # empty arrays when not supplied.
  def initialize(*)
    super
    self.groups = [] unless groups
    self.target = [] unless target
  end

  # The innermost open group, or nil when none is open.
  def last_group
    groups[-1]
  end

  # Whether the raw line ends with a newline character.
  def newline?
    current_line.end_with?("\n")
  end
end # State
|
131
|
+
end # class TextExtractor
|
@@ -0,0 +1,73 @@
|
|
1
|
+
class TextExtractor
  class Directives
    # Parent of every line directive. Stores the shared state and the optional
    # argument, then runs the subclass's #init hook when one is defined.
    class Directive
      attr_reader :state

      def initialize(state, argument = nil)
        @state = state
        @argument = argument
        init if respond_to?(:init)
      end
    end

    # Opens a line group.
    class Begin < Directive
      # Chooses the text placed right after the opening parenthesis: no
      # argument (or an empty one) gives '?:', the argument '?:' gives an
      # empty string, anything else is used verbatim.
      def init
        prefix =
          if ['', nil].include?(@argument)
            '?:'
          elsif @argument == '?:'
            ''
          else
            @argument
          end
        @group = group(prefix)
      end

      # Factory for the group object; overridden by subclasses.
      def group(*args)
        Group.new(*args)
      end

      # Drops the current line's text and pushes the new group on the stack.
      def call
        state.current = nil
        state.groups << @group
      end
    end

    # Opens a group whose lines are alternatives of each other.
    class Any < Begin
      def group(*args)
        AnyGroup.new(*args)
      end
    end

    # Text that is left out of the regexp; applying it does nothing.
    class Comment < Directive
      def call; end
    end

    # Closes the innermost line group and makes its finished content the
    # current text.
    class End < Directive
      def call
        closed = state.groups.pop
        state.current = closed.finish(state.newline?)
      end
    end

    # The current line or group may occur zero or one time.
    class Maybe < Directive
      def call
        inner = state.current
        state.current = ['(?:', inner, ')?']
      end
    end

    # The current line or group is repeated; the argument is the content of
    # the {...} quantifier and defaults to '0,'.
    class Repeat < Directive
      def call
        quantifier = (@argument ||= '0,')
        inner = state.current
        state.current = ['(?:', inner, "){#{quantifier}}"]
      end
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
class TextExtractor
  class Directives
    # A line group: the pieces collected between an opening and a closing
    # directive, rendered as one parenthesised unit.
    class Group
      # +type+ is the text placed right after the opening parenthesis; any
      # further arguments are the initial pieces.
      def initialize(type, *args)
        @type = type
        @lines = args
      end

      # Appends a piece (a string or a nested array of pieces).
      def <<(item)
        @lines << item
      end

      def to_a
        @lines
      end

      # Removes the trailing newline from the group's last piece unless
      # +newline+ is truthy. When the last piece is an array, the deepest
      # trailing array is located and its second-to-last element is chomped.
      def chomp(newline)
        return if newline || @lines.empty?

        holder = @lines
        index = -1
        if @lines.last.is_a?(Array)
          holder = @lines.last
          holder = holder.last while holder.last.is_a?(Array)
          index = -2
        end
        holder[index] = holder[index].chomp
      end

      # Chomps as required and returns the rendered group.
      def finish(newline)
        chomp(newline)
        join
      end

      # The pieces wrapped in "(<type>" ... ")".
      def join
        ["(#{@type}"] + @lines + [')']
      end
    end

    # A line group in which every line (or subgroup) is an alternative.
    class AnyGroup < Group
      # The pieces separated by "|" inside a non-capturing group.
      def join
        alternatives = @lines.each_with_index.flat_map do |piece, position|
          position.zero? ? [piece] : ['|', piece]
        end
        ['(?:', *alternatives, ')']
      end
    end
  end
end
|
@@ -5,6 +5,7 @@ class TextExtractor
|
|
5
5
|
def initialize(regexp, factory: nil, values: [], fill: [])
|
6
6
|
@regexp = regexp
|
7
7
|
@factory = factory
|
8
|
+
@constructor = FactoryAnalyzer.new(factory).to_proc
|
8
9
|
@values = values.map { |val| [val.id, val] }.to_h
|
9
10
|
@default_values = values.map { |val| [val.id, nil] }.to_h
|
10
11
|
@fill = Array(fill)
|
@@ -18,37 +19,8 @@ class TextExtractor
|
|
18
19
|
end
|
19
20
|
|
20
21
|
def build_extraction(extracted)
|
21
|
-
|
22
|
-
|
23
|
-
build_extraction_by_hash(extracted)
|
24
|
-
when Set
|
25
|
-
build_extraction_by_set(extracted)
|
26
|
-
when Class
|
27
|
-
build_extraction_by_class(extracted)
|
28
|
-
else
|
29
|
-
extracted
|
30
|
-
end
|
31
|
-
end
|
32
|
-
|
33
|
-
def build_extraction_by_hash(extracted)
|
34
|
-
klass, params = factory.first
|
35
|
-
klass.new(*extracted.values_at(*params))
|
36
|
-
end
|
37
|
-
|
38
|
-
def build_extraction_by_set(extracted)
|
39
|
-
klass, params = factory.first
|
40
|
-
values = params.each_with_object({}) do |param, hash|
|
41
|
-
hash[param] = extracted[param]
|
42
|
-
end
|
43
|
-
klass.new(**values)
|
44
|
-
end
|
45
|
-
|
46
|
-
def build_extraction_by_class(extracted)
|
47
|
-
if factory.ancestors.include?(Struct)
|
48
|
-
factory.new(*extracted.values)
|
49
|
-
else
|
50
|
-
factory.new(**extracted)
|
51
|
-
end
|
22
|
+
return extracted unless @constructor
|
23
|
+
@constructor.call(extracted)
|
52
24
|
end
|
53
25
|
|
54
26
|
def match(string, pos = 0)
|
@@ -70,5 +42,59 @@ class TextExtractor
|
|
70
42
|
# Returns a hash mapping every id in +values+ to the result of that value's
# #convert applied to the text +match+ holds under the same id.
def extract_values(match)
  values.keys.each_with_object({}) do |id, converted|
    converted[id] = values[id].convert(match[id])
  end
end
|
45
|
+
|
46
|
+
# converts the value of the factory option to a constructor proc
|
47
|
+
class FactoryAnalyzer
  # +factory+ is either a Hash whose first pair is class => parameter list,
  # or the class/proc itself (or nil).
  def initialize(factory)
    if factory.is_a?(Hash)
      @klass, @params = factory.first
    else
      @klass = factory
    end
  end

  # Returns a callable that takes the extracted hash and builds the result,
  # or nil when no usable factory was given.
  def to_proc
    return explicit if @params
    return @klass if @klass.is_a?(Proc)

    implicit if @klass
  end

  private

  # An Array of parameters means positional arguments, a Set means keyword
  # arguments; any other kind of parameter list yields nil.
  def explicit
    return positional if @params.is_a?(Array)

    keyword if @params.is_a?(Set)
  end

  # Passes the listed values, in the listed order, as positional arguments.
  def positional
    lambda do |extracted|
      @klass.new(*extracted.values_at(*@params))
    end
  end

  # Passes the listed values as keyword arguments.
  def keyword
    lambda do |extracted|
      picked = @params.map { |name| [name, extracted[name]] }.to_h
      @klass.new(**picked)
    end
  end

  # Struct descendants get all extracted values positionally; any other
  # class gets the whole extraction as keyword arguments.
  def implicit
    struct_like = @klass.ancestors.include?(Struct)
    return ->(extracted) { @klass.new(*extracted.values) } if struct_like

    ->(extracted) { @klass.new(**extracted) }
  end
end # class FactoryAnalyzer
|
73
99
|
end # class Record
|
74
100
|
end # class TextExtractor
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben Miller
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -59,6 +59,9 @@ extensions: []
|
|
59
59
|
extra_rdoc_files: []
|
60
60
|
files:
|
61
61
|
- lib/text_extractor.rb
|
62
|
+
- lib/text_extractor/directives.rb
|
63
|
+
- lib/text_extractor/directives/classes.rb
|
64
|
+
- lib/text_extractor/directives/group.rb
|
62
65
|
- lib/text_extractor/extraction.rb
|
63
66
|
- lib/text_extractor/filldown.rb
|
64
67
|
- lib/text_extractor/record.rb
|