text_extractor 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7d29f56c023ab50d2bcf9f40869e78febd4fbbb8
4
- data.tar.gz: 87da343a092a575b7683bfca3378ed3eeabff820
3
+ metadata.gz: 516fd52deaf25b6e67241cd40b55580a43227247
4
+ data.tar.gz: b776cac3194257f826d8671aef2cf1991a075ae2
5
5
  SHA512:
6
- metadata.gz: edcafad5fed934fde4b68b7b22058c600ecdadcbbcf629634285951860740f7a25136de3d133f075bce0a50cdc6bc7265c2b7ff14712ac495885ecc3c5c62741
7
- data.tar.gz: b09236449430419201b8e9f3a359f10b0933e0d2584d44b2b424137b2f733e6aa35b95172663f85ff4b2fe2cc68bb1af4b80771161a96d92ce205e1afe4faa3b
6
+ metadata.gz: 28cf87f08c5c04cc2d11c8576692f15919925360e6b5b31459d2319e4b5a3904a7925f87cc03749947a42cb21f8ce0f21759eb98a57376d7115725e6cba288be
7
+ data.tar.gz: fa79da2fbd314b46ea0343cf7647c23f843a98ad27a0e04b69a665b6f872ab02384bc7d555b2229aaf3207e082c60522f9dfffebcefc1ab596b8ad3d871604ff
@@ -68,13 +68,17 @@ class TextExtractor
68
68
  def strip_record(regexp, strip: nil)
69
69
  lines = regexp.source.split("\n")
70
70
  prefix = lines.last
71
- strip_record_by_line(lines, prefix, strip)
72
- Regexp.new(lines.join("\n").strip, regexp.options)
71
+
72
+ if prefix =~ /\A\s*\z/
73
+ lines.pop if lines.first =~ /\A\s*\z/
74
+ lines.shift
75
+ strip_record_by_line(lines, prefix, strip)
76
+ end
77
+
78
+ Regexp.new(lines.join("\n"), regexp.options)
73
79
  end
74
80
 
75
81
  def strip_record_by_line(lines, prefix, strip)
76
- return unless prefix =~ /\A\s*\z/
77
-
78
82
  lines.map! { |s| s.gsub(prefix.to_s, '') }
79
83
  case strip
80
84
  when :left then lines.map! { |s| "\[ \t\r\f]*#{s.lstrip}" }
@@ -0,0 +1,131 @@
1
+ require 'strscan'
2
+
3
+ require 'text_extractor/directives/classes'
4
+ require 'text_extractor/directives/group'
5
+
6
+ class TextExtractor
7
+ def self.expand_directives(re)
8
+ Directives.new(re).expand
9
+ end
10
+
11
+ # Directives can only be named with lowercase ASCII letters (a-z) and _
12
+ # (underscore).
13
+ #
14
+ # Directives can take an argument. An argument can contain any sequence of
15
+ # characters other than newlines, parentheses, or dots (.). The argument
16
+ # appears after the name, in parentheses, with no whitespace between the name
17
+ # and the left parenthesis. Whitespace inside the parentheses is taken literally
18
+ # and not ignored.
19
+ #
20
+ # When used, each directive name is preceded by a dot (.). There should be no
21
+ # whitespace on either side of the dot. Some directives can be chained one
22
+ # after another, still using a dot to separate the earlier directive from the
23
+ # later one.
24
+ class Directives
25
+ def initialize(original)
26
+ @source = original.source
27
+ @options = original.options
28
+ end
29
+
30
+ def expand
31
+ return @output if @output
32
+ @state = State.new
33
+ scanner = StringScanner.new(@source)
34
+ read_line(scanner) until scanner.eos?
35
+ raise 'Unterminated line group' unless @state.groups.empty?
36
+ @output = Regexp.new(@state.target.join(''), @options)
37
+ end
38
+
39
+ private
40
+
41
+ DIRECTIVE_MAP = {
42
+ ' ' => { class: Comment, arguments: ->(source) { [source[1..-1]] } },
43
+ 'any' => { class: Any },
44
+ 'begin' => { class: Begin, arguments: :parsed },
45
+ 'end' => { class: End },
46
+ 'maybe' => { class: Maybe },
47
+ 'repeat' => { class: Repeat, arguments: :parse }
48
+ }.freeze
49
+ private_constant :DIRECTIVE_MAP
50
+
51
+ def read_line(scanner)
52
+ line = scanner.scan_until(/\n/)
53
+
54
+ unless line
55
+ line = scanner.rest
56
+ scanner.skip(/.*/)
57
+ end
58
+
59
+ @state.current = @state.current_line = line
60
+ add_line
61
+ end
62
+
63
+ def add_line
64
+ apply_directives read_directives
65
+ return unless @state.current
66
+
67
+ if @state.groups.empty?
68
+ @state.target << @state.current
69
+ else
70
+ @state.groups.last << @state.current
71
+ end
72
+ end
73
+
74
+ def read_directives
75
+ md = @state.current_line.match(/(^| )#\./)
76
+
77
+ if md
78
+ @state.current = md.pre_match
79
+ @state.current += "\n" if @state.newline?
80
+ parse_directives(md.post_match.rstrip)
81
+ else
82
+ []
83
+ end
84
+ end
85
+
86
+ def apply_directives(directives)
87
+ directives.each(&:call)
88
+ end
89
+
90
+ def parse_directives(full_source)
91
+ return [Comment.new(@state)] if full_source.start_with?(' ')
92
+ split_directives(full_source)
93
+ .map { |source| parse_one_directive(source) }
94
+ end
95
+
96
+ def parse_one_directive(source)
97
+ md = source.match(/^[a-z_]+/)
98
+ word = md[0]
99
+ raise "Unknown directive(s) #{source}" unless md
100
+ map = DIRECTIVE_MAP.fetch(word) { raise "Unknown directive #{word}" }
101
+ args = parse_arguments(map[:arguments], md.post_match)
102
+ map.fetch(:class).new(@state, *args)
103
+ end
104
+
105
+ def split_directives(source)
106
+ source.split('.')
107
+ end
108
+
109
+ def parse_arguments(rule, source)
110
+ return [] unless rule
111
+ return rule.call(source) if rule.is_a?(Proc)
112
+ source.match(/\(([^)]*)\)/) { |md| md[1] }
113
+ end
114
+ end # class Directives
115
+
116
+ State = Struct.new(:current, :current_line, :groups, :target) do
117
+ def initialize(*)
118
+ super
119
+ self.groups ||= []
120
+ self.target ||= []
121
+ end
122
+
123
+ def last_group
124
+ groups.last
125
+ end
126
+
127
+ def newline?
128
+ current_line.end_with?("\n")
129
+ end
130
+ end # State struct
131
+ end # class TextExtractor
@@ -0,0 +1,73 @@
1
+ class TextExtractor
2
+ class Directives
3
+ # base class for line directives
4
+ class Directive
5
+ attr_reader :state
6
+
7
+ def initialize(state, argument = nil)
8
+ @state = state
9
+ @argument = argument
10
+ init if respond_to?(:init)
11
+ end
12
+ end
13
+
14
+ # open a line group
15
+ class Begin < Directive
16
+ def init
17
+ type = case @argument
18
+ when '', nil
19
+ '?:'
20
+ when '?:'
21
+ ''
22
+ else
23
+ @argument
24
+ end
25
+ @group = group(type)
26
+ end
27
+
28
+ def group(*args)
29
+ Group.new(*args)
30
+ end
31
+
32
+ def call
33
+ state.current = nil
34
+ state.groups.push @group
35
+ end
36
+ end
37
+
38
+ # alternating capture group
39
+ class Any < Begin
40
+ def group(*args)
41
+ AnyGroup.new(*args)
42
+ end
43
+ end
44
+
45
+ # text that will be omitted from the regexp
46
+ class Comment < Directive
47
+ def call
48
+ end
49
+ end
50
+
51
+ # close a line group
52
+ class End < Directive
53
+ def call
54
+ state.current = state.groups.pop.finish(state.newline?)
55
+ end
56
+ end
57
+
58
+ # current line or group occurs 0 or 1 times
59
+ class Maybe < Directive
60
+ def call
61
+ state.current = ['(?:', state.current, ')?']
62
+ end
63
+ end
64
+
65
+ # repetition
66
+ class Repeat < Directive
67
+ def call
68
+ @argument ||= '0,'
69
+ state.current = ['(?:', state.current, "){#{@argument}}"]
70
+ end
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,46 @@
1
+ class TextExtractor
2
+ class Directives
3
+ # a line group
4
+ class Group
5
+ def initialize(type, *args)
6
+ @type = type
7
+ @lines = args
8
+ end
9
+
10
+ def <<(item)
11
+ @lines << item
12
+ end
13
+
14
+ def to_a
15
+ @lines
16
+ end
17
+
18
+ def chomp(newline)
19
+ return if @lines.empty? || newline
20
+ tail = @lines[-1]
21
+ if tail.is_a?(Array)
22
+ tail = tail[-1] while tail[-1].is_a?(Array)
23
+ tail[-2] = tail[-2].chomp
24
+ else
25
+ @lines[-1] = @lines[-1].chomp
26
+ end
27
+ end
28
+
29
+ def finish(newline)
30
+ chomp(newline)
31
+ join
32
+ end
33
+
34
+ def join
35
+ ["(#{@type}", *@lines, ')']
36
+ end
37
+ end
38
+
39
+ # a line group where each line (or subgroup) is an alternative
40
+ class AnyGroup < Group
41
+ def join
42
+ ['(?:', *@lines.flat_map { |e| [e, '|'] }[0..-2], ')']
43
+ end
44
+ end
45
+ end
46
+ end
@@ -5,6 +5,7 @@ class TextExtractor
5
5
  def initialize(regexp, factory: nil, values: [], fill: [])
6
6
  @regexp = regexp
7
7
  @factory = factory
8
+ @constructor = FactoryAnalyzer.new(factory).to_proc
8
9
  @values = values.map { |val| [val.id, val] }.to_h
9
10
  @default_values = values.map { |val| [val.id, nil] }.to_h
10
11
  @fill = Array(fill)
@@ -18,37 +19,8 @@ class TextExtractor
18
19
  end
19
20
 
20
21
  def build_extraction(extracted)
21
- case factory
22
- when Hash
23
- build_extraction_by_hash(extracted)
24
- when Set
25
- build_extraction_by_set(extracted)
26
- when Class
27
- build_extraction_by_class(extracted)
28
- else
29
- extracted
30
- end
31
- end
32
-
33
- def build_extraction_by_hash(extracted)
34
- klass, params = factory.first
35
- klass.new(*extracted.values_at(*params))
36
- end
37
-
38
- def build_extraction_by_set(extracted)
39
- klass, params = factory.first
40
- values = params.each_with_object({}) do |param, hash|
41
- hash[param] = extracted[param]
42
- end
43
- klass.new(**values)
44
- end
45
-
46
- def build_extraction_by_class(extracted)
47
- if factory.ancestors.include?(Struct)
48
- factory.new(*extracted.values)
49
- else
50
- factory.new(**extracted)
51
- end
22
+ return extracted unless @constructor
23
+ @constructor.call(extracted)
52
24
  end
53
25
 
54
26
  def match(string, pos = 0)
@@ -70,5 +42,59 @@ class TextExtractor
70
42
  def extract_values(match)
71
43
  values.keys.map { |id| [id, values[id].convert(match[id])] }.to_h
72
44
  end
45
+
46
+ # converts the value of the factory option to a constructor proc
47
+ class FactoryAnalyzer
48
+ def initialize(factory)
49
+ case factory
50
+ when Hash
51
+ @klass, @params = factory.first
52
+ else
53
+ @klass = factory
54
+ end
55
+ end
56
+
57
+ def to_proc
58
+ if @params
59
+ explicit
60
+ elsif @klass.is_a?(Proc)
61
+ @klass
62
+ elsif @klass
63
+ implicit
64
+ end
65
+ end
66
+
67
+ private
68
+
69
+ def explicit
70
+ case @params
71
+ when Array
72
+ positional
73
+ when Set
74
+ keyword
75
+ end
76
+ end
77
+
78
+ def positional
79
+ ->(extracted) { @klass.new(*extracted.values_at(*@params)) }
80
+ end
81
+
82
+ def keyword
83
+ lambda do |extracted|
84
+ values = @params.each_with_object({}) do |param, hash|
85
+ hash[param] = extracted[param]
86
+ end
87
+ @klass.new(**values)
88
+ end
89
+ end
90
+
91
+ def implicit
92
+ if @klass.ancestors.include?(Struct)
93
+ ->(extracted) { @klass.new(*extracted.values) }
94
+ else
95
+ ->(extracted) { @klass.new(**extracted) }
96
+ end
97
+ end
98
+ end # class FactoryAnalyzer
73
99
  end # class Record
74
100
  end # class TextExtractor
@@ -1,5 +1,5 @@
1
1
  class TextExtractor
2
2
  def self.version
3
- '0.1.4'
3
+ '0.1.5'
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: text_extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben Miller
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-17 00:00:00.000000000 Z
11
+ date: 2016-03-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -59,6 +59,9 @@ extensions: []
59
59
  extra_rdoc_files: []
60
60
  files:
61
61
  - lib/text_extractor.rb
62
+ - lib/text_extractor/directives.rb
63
+ - lib/text_extractor/directives/classes.rb
64
+ - lib/text_extractor/directives/group.rb
62
65
  - lib/text_extractor/extraction.rb
63
66
  - lib/text_extractor/filldown.rb
64
67
  - lib/text_extractor/record.rb