text_extractor 0.1.4 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7d29f56c023ab50d2bcf9f40869e78febd4fbbb8
4
- data.tar.gz: 87da343a092a575b7683bfca3378ed3eeabff820
3
+ metadata.gz: 516fd52deaf25b6e67241cd40b55580a43227247
4
+ data.tar.gz: b776cac3194257f826d8671aef2cf1991a075ae2
5
5
  SHA512:
6
- metadata.gz: edcafad5fed934fde4b68b7b22058c600ecdadcbbcf629634285951860740f7a25136de3d133f075bce0a50cdc6bc7265c2b7ff14712ac495885ecc3c5c62741
7
- data.tar.gz: b09236449430419201b8e9f3a359f10b0933e0d2584d44b2b424137b2f733e6aa35b95172663f85ff4b2fe2cc68bb1af4b80771161a96d92ce205e1afe4faa3b
6
+ metadata.gz: 28cf87f08c5c04cc2d11c8576692f15919925360e6b5b31459d2319e4b5a3904a7925f87cc03749947a42cb21f8ce0f21759eb98a57376d7115725e6cba288be
7
+ data.tar.gz: fa79da2fbd314b46ea0343cf7647c23f843a98ad27a0e04b69a665b6f872ab02384bc7d555b2229aaf3207e082c60522f9dfffebcefc1ab596b8ad3d871604ff
@@ -68,13 +68,17 @@ class TextExtractor
68
68
  def strip_record(regexp, strip: nil)
69
69
  lines = regexp.source.split("\n")
70
70
  prefix = lines.last
71
- strip_record_by_line(lines, prefix, strip)
72
- Regexp.new(lines.join("\n").strip, regexp.options)
71
+
72
+ if prefix =~ /\A\s*\z/
73
+ lines.pop if lines.first =~ /\A\s*\z/
74
+ lines.shift
75
+ strip_record_by_line(lines, prefix, strip)
76
+ end
77
+
78
+ Regexp.new(lines.join("\n"), regexp.options)
73
79
  end
74
80
 
75
81
  def strip_record_by_line(lines, prefix, strip)
76
- return unless prefix =~ /\A\s*\z/
77
-
78
82
  lines.map! { |s| s.gsub(prefix.to_s, '') }
79
83
  case strip
80
84
  when :left then lines.map! { |s| "\[ \t\r\f]*#{s.lstrip}" }
@@ -0,0 +1,131 @@
1
+ require 'strscan'
2
+
3
+ require 'text_extractor/directives/classes'
4
+ require 'text_extractor/directives/group'
5
+
6
+ class TextExtractor
7
+ def self.expand_directives(re)
8
+ Directives.new(re).expand
9
+ end
10
+
11
+ # Directives can only be named with lowercase ascii letters (a-z) and _
12
+ # (underscore).
13
+ #
14
+ # Directives can take an argument. An argument can contain any sequence of
15
+ # characters other than newlines, parentheses, or dot (.). The argument
16
+ # appears after the name, in parentheses, with no whitespace between the name
17
+ # and left parenthesis. Whitespace inside the parentheses is taken literally
18
+ # and not ignored.
19
+ #
20
+ # When used, each directive name is preceded by a dot (.). There should be no
21
+ # whitespace on either side of the dot. Some directives can be chained one
22
+ # after another, still using a dot to separate the earlier directive from the
23
+ # later one.
24
+ class Directives
25
+ def initialize(original)
26
+ @source = original.source
27
+ @options = original.options
28
+ end
29
+
30
+ def expand
31
+ return @output if @output
32
+ @state = State.new
33
+ scanner = StringScanner.new(@source)
34
+ read_line(scanner) until scanner.eos?
35
+ raise 'Unterminated line group' unless @state.groups.empty?
36
+ @output = Regexp.new(@state.target.join(''), @options)
37
+ end
38
+
39
+ private
40
+
41
+ DIRECTIVE_MAP = {
42
+ ' ' => { class: Comment, arguments: ->(source) { [source[1..-1]] } },
43
+ 'any' => { class: Any },
44
+ 'begin' => { class: Begin, arguments: :parsed },
45
+ 'end' => { class: End },
46
+ 'maybe' => { class: Maybe },
47
+ 'repeat' => { class: Repeat, arguments: :parse }
48
+ }.freeze
49
+ private_constant :DIRECTIVE_MAP
50
+
51
+ def read_line(scanner)
52
+ line = scanner.scan_until(/\n/)
53
+
54
+ unless line
55
+ line = scanner.rest
56
+ scanner.skip(/.*/)
57
+ end
58
+
59
+ @state.current = @state.current_line = line
60
+ add_line
61
+ end
62
+
63
+ def add_line
64
+ apply_directives read_directives
65
+ return unless @state.current
66
+
67
+ if @state.groups.empty?
68
+ @state.target << @state.current
69
+ else
70
+ @state.groups.last << @state.current
71
+ end
72
+ end
73
+
74
+ def read_directives
75
+ md = @state.current_line.match(/(^| )#\./)
76
+
77
+ if md
78
+ @state.current = md.pre_match
79
+ @state.current += "\n" if @state.newline?
80
+ parse_directives(md.post_match.rstrip)
81
+ else
82
+ []
83
+ end
84
+ end
85
+
86
+ def apply_directives(directives)
87
+ directives.each(&:call)
88
+ end
89
+
90
+ def parse_directives(full_source)
91
+ return [Comment.new(@state)] if full_source.start_with?(' ')
92
+ split_directives(full_source)
93
+ .map { |source| parse_one_directive(source) }
94
+ end
95
+
96
+ def parse_one_directive(source)
97
+ md = source.match(/^[a-z_]+/)
98
+ word = md[0]
99
+ raise "Unknown directive(s) #{source}" unless md
100
+ map = DIRECTIVE_MAP.fetch(word) { raise "Unknown directive #{word}" }
101
+ args = parse_arguments(map[:arguments], md.post_match)
102
+ map.fetch(:class).new(@state, *args)
103
+ end
104
+
105
+ def split_directives(source)
106
+ source.split('.')
107
+ end
108
+
109
+ def parse_arguments(rule, source)
110
+ return [] unless rule
111
+ return rule.call(source) if rule.is_a?(Proc)
112
+ source.match(/\(([^)]*)\)/) { |md| md[1] }
113
+ end
114
+ end # class Expander
115
+
116
+ State = Struct.new(:current, :current_line, :groups, :target) do
117
+ def initialize(*)
118
+ super
119
+ self.groups ||= []
120
+ self.target ||= []
121
+ end
122
+
123
+ def last_group
124
+ groups.last
125
+ end
126
+
127
+ def newline?
128
+ current_line.end_with?("\n")
129
+ end
130
+ end # module Directives
131
+ end # class TextExtractor
@@ -0,0 +1,73 @@
1
+ class TextExtractor
2
+ class Directives
3
+ # base class for line directives
4
+ class Directive
5
+ attr_reader :state
6
+
7
+ def initialize(state, argument = nil)
8
+ @state = state
9
+ @argument = argument
10
+ init if respond_to?(:init)
11
+ end
12
+ end
13
+
14
+ # open a line group
15
+ class Begin < Directive
16
+ def init
17
+ type = case @argument
18
+ when '', nil
19
+ '?:'
20
+ when '?:'
21
+ ''
22
+ else
23
+ @argument
24
+ end
25
+ @group = group(type)
26
+ end
27
+
28
+ def group(*args)
29
+ Group.new(*args)
30
+ end
31
+
32
+ def call
33
+ state.current = nil
34
+ state.groups.push @group
35
+ end
36
+ end
37
+
38
+ # alternating capture group
39
+ class Any < Begin
40
+ def group(*args)
41
+ AnyGroup.new(*args)
42
+ end
43
+ end
44
+
45
+ # text that will be omitted from the regexp
46
+ class Comment < Directive
47
+ def call
48
+ end
49
+ end
50
+
51
+ # close a line group
52
+ class End < Directive
53
+ def call
54
+ state.current = state.groups.pop.finish(state.newline?)
55
+ end
56
+ end
57
+
58
+ # current line or group occurs 0 or 1 times
59
+ class Maybe < Directive
60
+ def call
61
+ state.current = ['(?:', state.current, ')?']
62
+ end
63
+ end
64
+
65
+ # repetition
66
+ class Repeat < Directive
67
+ def call
68
+ @argument ||= '0,'
69
+ state.current = ['(?:', state.current, "){#{@argument}}"]
70
+ end
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,46 @@
1
+ class TextExtractor
2
+ class Directives
3
+ # a line group
4
+ class Group
5
+ def initialize(type, *args)
6
+ @type = type
7
+ @lines = args
8
+ end
9
+
10
+ def <<(item)
11
+ @lines << item
12
+ end
13
+
14
+ def to_a
15
+ @lines
16
+ end
17
+
18
+ def chomp(newline)
19
+ return if @lines.empty? || newline
20
+ tail = @lines[-1]
21
+ if tail.is_a?(Array)
22
+ tail = tail[-1] while tail[-1].is_a?(Array)
23
+ tail[-2] = tail[-2].chomp
24
+ else
25
+ @lines[-1] = @lines[-1].chomp
26
+ end
27
+ end
28
+
29
+ def finish(newline)
30
+ chomp(newline)
31
+ join
32
+ end
33
+
34
+ def join
35
+ ["(#{@type}", *@lines, ')']
36
+ end
37
+ end
38
+
39
+ # a line group where each line (or subgroup) is an alternative
40
+ class AnyGroup < Group
41
+ def join
42
+ ['(?:', *@lines.flat_map { |e| [e, '|'] }[0..-2], ')']
43
+ end
44
+ end
45
+ end
46
+ end
@@ -5,6 +5,7 @@ class TextExtractor
5
5
  def initialize(regexp, factory: nil, values: [], fill: [])
6
6
  @regexp = regexp
7
7
  @factory = factory
8
+ @constructor = FactoryAnalyzer.new(factory).to_proc
8
9
  @values = values.map { |val| [val.id, val] }.to_h
9
10
  @default_values = values.map { |val| [val.id, nil] }.to_h
10
11
  @fill = Array(fill)
@@ -18,37 +19,8 @@ class TextExtractor
18
19
  end
19
20
 
20
21
  def build_extraction(extracted)
21
- case factory
22
- when Hash
23
- build_extraction_by_hash(extracted)
24
- when Set
25
- build_extraction_by_set(extracted)
26
- when Class
27
- build_extraction_by_class(extracted)
28
- else
29
- extracted
30
- end
31
- end
32
-
33
- def build_extraction_by_hash(extracted)
34
- klass, params = factory.first
35
- klass.new(*extracted.values_at(*params))
36
- end
37
-
38
- def build_extraction_by_set(extracted)
39
- klass, params = factory.first
40
- values = params.each_with_object({}) do |param, hash|
41
- hash[param] = extracted[param]
42
- end
43
- klass.new(**values)
44
- end
45
-
46
- def build_extraction_by_class(extracted)
47
- if factory.ancestors.include?(Struct)
48
- factory.new(*extracted.values)
49
- else
50
- factory.new(**extracted)
51
- end
22
+ return extracted unless @constructor
23
+ @constructor.call(extracted)
52
24
  end
53
25
 
54
26
  def match(string, pos = 0)
@@ -70,5 +42,59 @@ class TextExtractor
70
42
  def extract_values(match)
71
43
  values.keys.map { |id| [id, values[id].convert(match[id])] }.to_h
72
44
  end
45
+
46
+ # converts the value of the factory option to a constructor proc
47
+ class FactoryAnalyzer
48
+ def initialize(factory)
49
+ case factory
50
+ when Hash
51
+ @klass, @params = factory.first
52
+ else
53
+ @klass = factory
54
+ end
55
+ end
56
+
57
+ def to_proc
58
+ if @params
59
+ explicit
60
+ elsif @klass.is_a?(Proc)
61
+ @klass
62
+ elsif @klass
63
+ implicit
64
+ end
65
+ end
66
+
67
+ private
68
+
69
+ def explicit
70
+ case @params
71
+ when Array
72
+ positional
73
+ when Set
74
+ keyword
75
+ end
76
+ end
77
+
78
+ def positional
79
+ ->(extracted) { @klass.new(*extracted.values_at(*@params)) }
80
+ end
81
+
82
+ def keyword
83
+ lambda do |extracted|
84
+ values = @params.each_with_object({}) do |param, hash|
85
+ hash[param] = extracted[param]
86
+ end
87
+ @klass.new(**values)
88
+ end
89
+ end
90
+
91
+ def implicit
92
+ if @klass.ancestors.include?(Struct)
93
+ ->(extracted) { @klass.new(*extracted.values) }
94
+ else
95
+ ->(extracted) { @klass.new(**extracted) }
96
+ end
97
+ end
98
+ end # class FactoryAnalyzer
73
99
  end # class Record
74
100
  end # class TextExtractor
@@ -1,5 +1,5 @@
1
1
  class TextExtractor
2
2
  def self.version
3
- '0.1.4'
3
+ '0.1.5'
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Miller
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-17 00:00:00.000000000 Z
11
+ date: 2016-03-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -59,6 +59,9 @@ extensions: []
59
59
  extra_rdoc_files: []
60
60
  files:
61
61
  - lib/text_extractor.rb
62
+ - lib/text_extractor/directives.rb
63
+ - lib/text_extractor/directives/classes.rb
64
+ - lib/text_extractor/directives/group.rb
62
65
  - lib/text_extractor/extraction.rb
63
66
  - lib/text_extractor/filldown.rb
64
67
  - lib/text_extractor/record.rb