RubyGems - text_extractor - Versions diffs - 0.1.4 → 0.1.5 - Mend

text_extractor 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml +4 -4
data/lib/text_extractor.rb +8 -4
data/lib/text_extractor/directives.rb +131 -0
data/lib/text_extractor/directives/classes.rb +73 -0
data/lib/text_extractor/directives/group.rb +46 -0
data/lib/text_extractor/record.rb +57 -31
data/lib/text_extractor/version.rb +1 -1
metadata +5 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 7d29f56c023ab50d2bcf9f40869e78febd4fbbb8
-  data.tar.gz: 87da343a092a575b7683bfca3378ed3eeabff820
+  metadata.gz: 516fd52deaf25b6e67241cd40b55580a43227247
+  data.tar.gz: b776cac3194257f826d8671aef2cf1991a075ae2
 SHA512:
-  metadata.gz: edcafad5fed934fde4b68b7b22058c600ecdadcbbcf629634285951860740f7a25136de3d133f075bce0a50cdc6bc7265c2b7ff14712ac495885ecc3c5c62741
-  data.tar.gz: b09236449430419201b8e9f3a359f10b0933e0d2584d44b2b424137b2f733e6aa35b95172663f85ff4b2fe2cc68bb1af4b80771161a96d92ce205e1afe4faa3b
+  metadata.gz: 28cf87f08c5c04cc2d11c8576692f15919925360e6b5b31459d2319e4b5a3904a7925f87cc03749947a42cb21f8ce0f21759eb98a57376d7115725e6cba288be
+  data.tar.gz: fa79da2fbd314b46ea0343cf7647c23f843a98ad27a0e04b69a665b6f872ab02384bc7d555b2229aaf3207e082c60522f9dfffebcefc1ab596b8ad3d871604ff

data/lib/text_extractor.rb CHANGED Viewed

@@ -68,13 +68,17 @@ class TextExtractor
   def strip_record(regexp, strip: nil)
     lines = regexp.source.split("\n")
     prefix = lines.last
-    strip_record_by_line(lines, prefix, strip)
-    Regexp.new(lines.join("\n").strip, regexp.options)
+    if prefix =~ /\A\s*\z/
+      lines.pop if lines.first =~ /\A\s*\z/
+      lines.shift
+      strip_record_by_line(lines, prefix, strip)
+    end
+    Regexp.new(lines.join("\n"), regexp.options)
   end
   def strip_record_by_line(lines, prefix, strip)
-    return unless prefix =~ /\A\s*\z/
     lines.map! { |s| s.gsub(prefix.to_s, '') }
     case strip
     when :left  then lines.map! { |s| "\[ \t\r\f]*#{s.lstrip}" }

data/lib/text_extractor/directives.rb ADDED Viewed

@@ -0,0 +1,131 @@
+require 'strscan'
+require 'text_extractor/directives/classes'
+require 'text_extractor/directives/group'
+class TextExtractor
+  def self.expand_directives(re)  # Convenience wrapper: builds a Directives for +re+ and returns its #expand result.
+    Directives.new(re).expand
+  end
+  # Directives can only be named with lowercase ascii letters (a-z) and _
+  # (underscore).
+  #
+  # Directives can take an argument. An argument can contain any sequence of
+  # characters other than newlines, parentheses, or dots (.). The argument
+  # appears after the name, in parentheses, with no whitespace between the name
+  # and left parenthesis. Whitespace inside the parentheses is taken literally
+  # and not ignored.
+  #
+  # When used, each directive name is preceded by a dot (.). There should be no
+  # whitespace on either side of the dot. Some directives can be chained one
+  # after another, still using a dot to separate the earlier directive from the
+  # later one.
+  class Directives
+    def initialize(original)  # +original+ must respond to #source and #options (a Regexp does).
+      @source = original.source
+      @options = original.options
+    end
+    def expand  # Returns the expanded Regexp; raises RuntimeError if a line group is still open at the end.
+      return @output if @output  # memoized: the source is scanned only once
+      @state = State.new
+      scanner = StringScanner.new(@source)
+      read_line(scanner) until scanner.eos?
+      raise 'Unterminated line group' unless @state.groups.empty?
+      @output = Regexp.new(@state.target.join(''), @options)  # Array#join also flattens the nested arrays directives produce
+    end
+    private
+    DIRECTIVE_MAP = {  # directive name => implementing class and, optionally, an argument rule
+      ' '      => { class: Comment, arguments: ->(source) { [source[1..-1]] } },  # NOTE(review): looks unreachable via parse_one_directive, which only matches [a-z_]+ names — confirm
+      'any'    => { class: Any },
+      'begin'  => { class: Begin, arguments: :parsed },  # NOTE(review): :parsed here vs :parse on 'repeat'; parse_arguments handles every non-Proc truthy rule identically — confirm intended spelling
+      'end'    => { class: End },
+      'maybe'  => { class: Maybe },
+      'repeat' => { class: Repeat, arguments: :parse }
+    }.freeze
+    private_constant :DIRECTIVE_MAP
+    def read_line(scanner)  # Consumes one line (including its "\n", if present) from +scanner+ and processes it.
+      line = scanner.scan_until(/\n/)
+      unless line  # no newline left: take the unterminated final line
+        line = scanner.rest
+        scanner.skip(/.*/)
+      end
+      @state.current = @state.current_line = line
+      add_line
+    end
+    def add_line  # Applies the current line's directives, then stores whatever remains of the line.
+      apply_directives read_directives
+      return unless @state.current  # a directive (e.g. Begin#call) cleared the line
+      if @state.groups.empty?
+        @state.target << @state.current
+      else
+        @state.groups.last << @state.current  # inside an open group: collect the line there instead
+      end
+    end
+    def read_directives  # Splits the line at the first "#." marker (at line start or after a space); returns parsed directives or [].
+      md = @state.current_line.match(/(^| )#\./)
+      if md
+        @state.current = md.pre_match  # keep only the text before the marker
+        @state.current += "\n" if @state.newline?  # restore the newline that was cut off with the directive text
+        parse_directives(md.post_match.rstrip)
+      else
+        []
+      end
+    end
+    def apply_directives(directives)  # Invokes #call on each directive, in order.
+      directives.each(&:call)
+    end
+    def parse_directives(full_source)  # +full_source+ is the text following "#."; returns an array of directive objects.
+      return [Comment.new(@state)] if full_source.start_with?(' ')  # "#. text" is treated as a plain comment
+      split_directives(full_source)
+        .map { |source| parse_one_directive(source) }
+    end
+    def parse_one_directive(source)
+      md = source.match(/^[a-z_]+/)
+      word = md[0]
+      raise "Unknown directive(s) #{source}" unless md
+      map = DIRECTIVE_MAP.fetch(word) { raise "Unknown directive #{word}" }
+      args = parse_arguments(map[:arguments], md.post_match)
+      map.fetch(:class).new(@state, *args)
+    end
+    def split_directives(source)  # Chained directives are separated by dots.
+      source.split('.')
+    end
+    def parse_arguments(rule, source)  # Returns [] (no rule), the Proc's result, or the text inside the first "(...)" (nil if none).
+      return [] unless rule
+      return rule.call(source) if rule.is_a?(Proc)
+      source.match(/\(([^)]*)\)/) { |md| md[1] }  # any other truthy rule: extract a parenthesised argument
+    end
+  end # class Directives
+  State = Struct.new(:current, :current_line, :groups, :target) do  # mutable parse state handed to every directive object
+    def initialize(*)
+      super
+      self.groups ||= []  # default to empty arrays when not supplied
+      self.target ||= []
+    end
+    def last_group
+      groups.last
+    end
+    def newline?  # true when the raw line being processed ends with "\n"
+      current_line.end_with?("\n")
+    end
+  end # State
+end # class TextExtractor

data/lib/text_extractor/directives/classes.rb ADDED Viewed

@@ -0,0 +1,73 @@
+class TextExtractor
+  class Directives
+    # base class for line directives
+    class Directive
+      attr_reader :state
+      def initialize(state, argument = nil)  # +state+ is the shared parse state; +argument+ is the optional directive argument
+        @state = state
+        @argument = argument
+        init if respond_to?(:init)  # optional subclass hook (Begin defines one)
+      end
+    end
+    # open a line group
+    class Begin < Directive
+      def init  # Maps the argument to the text placed right after the group's "(".
+        type = case @argument
+               when '', nil
+                 '?:'  # no argument: non-capturing group
+               when '?:'
+                 ''  # an explicit "?:" argument yields a plain "(" group
+               else
+                 @argument  # anything else is used verbatim
+               end
+        @group = group(type)
+      end
+      def group(*args)  # factory hook; Any overrides it
+        Group.new(*args)
+      end
+      def call  # Drops the directive's own line and opens the group.
+        state.current = nil
+        state.groups.push @group
+      end
+    end
+    # alternating capture group
+    class Any < Begin
+      def group(*args)  # same flow as Begin, but lines are collected into an AnyGroup
+        AnyGroup.new(*args)
+      end
+    end
+    # text that will be omitted from the regexp
+    class Comment < Directive
+      def call  # no-op: leaves the parse state untouched
+      end
+    end
+    # close a line group
+    class End < Directive
+      def call  # Pops the innermost group and makes its finished form the current content.
+        state.current = state.groups.pop.finish(state.newline?)  # NOTE(review): pop returns nil when no group is open, which would raise NoMethodError here — confirm
+      end
+    end
+    # current line or group occurs 0 or 1 times
+    class Maybe < Directive
+      def call  # Wraps the current content in "(?:" ... ")?".
+        state.current = ['(?:', state.current, ')?']
+      end
+    end
+    # repetition
+    class Repeat < Directive
+      def call  # Wraps the current content in "(?:" ... "){ARG}".
+        @argument ||= '0,'  # default quantifier when no argument was given
+        state.current = ['(?:', state.current, "){#{@argument}}"]
+      end
+    end
+  end
+end

data/lib/text_extractor/directives/group.rb ADDED Viewed

@@ -0,0 +1,46 @@
+class TextExtractor
+  class Directives
+    # a line group
+    class Group
+      def initialize(type, *args)  # +type+ is the text emitted right after "("; +args+ seed the collected lines
+        @type = type
+        @lines = args
+      end
+      def <<(item)
+        @lines << item
+      end
+      def to_a
+        @lines
+      end
+      def chomp(newline)  # Unless +newline+ is truthy, strips the line ending from the group's last line.
+        return if @lines.empty? || newline
+        tail = @lines[-1]
+        if tail.is_a?(Array)
+          tail = tail[-1] while tail[-1].is_a?(Array)  # descend into the innermost trailing array
+          tail[-2] = tail[-2].chomp  # assumes the second-to-last element is a String — TODO confirm for wrapped groups
+        else
+          @lines[-1] = @lines[-1].chomp
+        end
+      end
+      def finish(newline)  # Applies #chomp, then returns the array built by #join.
+        chomp(newline)
+        join
+      end
+      def join  # => ["(TYPE", *lines, ")"]
+        ["(#{@type}", *@lines, ')']
+      end
+    end
+    # a line group where each line (or subgroup) is an alternative
+    class AnyGroup < Group
+      def join  # => ["(?:", line, "|", line, ..., ")"]; the group type is not used here
+        ['(?:', *@lines.flat_map { |e| [e, '|'] }[0..-2], ')']  # [0..-2] drops the trailing "|"
+      end
+    end
+  end
+end

data/lib/text_extractor/record.rb CHANGED Viewed

@@ -5,6 +5,7 @@ class TextExtractor
     def initialize(regexp, factory: nil, values: [], fill: [])
       @regexp = regexp
       @factory = factory
+      @constructor = FactoryAnalyzer.new(factory).to_proc
       @values = values.map { |val| [val.id, val] }.to_h
       @default_values = values.map { |val| [val.id, nil] }.to_h
       @fill = Array(fill)
@@ -18,37 +19,8 @@ class TextExtractor
     end
     def build_extraction(extracted)
-      case factory
-      when Hash
-        build_extraction_by_hash(extracted)
-      when Set
-        build_extraction_by_set(extracted)
-      when Class
-        build_extraction_by_class(extracted)
-      else
-        extracted
-      end
-    end
-    def build_extraction_by_hash(extracted)
-      klass, params = factory.first
-      klass.new(*extracted.values_at(*params))
-    end
-    def build_extraction_by_set(extracted)
-      klass, params = factory.first
-      values = params.each_with_object({}) do |param, hash|
-        hash[param] = extracted[param]
-      end
-      klass.new(**values)
-    end
-    def build_extraction_by_class(extracted)
-      if factory.ancestors.include?(Struct)
-        factory.new(*extracted.values)
-      else
-        factory.new(**extracted)
-      end
+      return extracted unless @constructor
+      @constructor.call(extracted)
     end
     def match(string, pos = 0)
@@ -70,5 +42,59 @@ class TextExtractor
     def extract_values(match)
       values.keys.map { |id| [id, values[id].convert(match[id])] }.to_h
     end
+    # converts the value of the factory option to a constructor proc
+    class FactoryAnalyzer
+      def initialize(factory)  # A Hash factory supplies a klass => params pair; anything else is taken as the class/proc itself.
+        case factory
+        when Hash
+          @klass, @params = factory.first  # only the first pair of the hash is used
+        else
+          @klass = factory
+        end
+      end
+      def to_proc  # Returns a callable that takes the extracted values, or nil (e.g. when factory is nil).
+        if @params
+          explicit
+        elsif @klass.is_a?(Proc)
+          @klass  # a Proc factory is used as-is
+        elsif @klass
+          implicit
+        end
+      end
+      private
+      def explicit  # NOTE(review): returns nil when params are neither an Array nor a Set — confirm intended
+        case @params
+        when Array
+          positional
+        when Set
+          keyword
+        end
+      end
+      def positional  # klass.new(*values), in the order listed in @params
+        ->(extracted) { @klass.new(*extracted.values_at(*@params)) }
+      end
+      def keyword  # klass.new(**values), restricted to the keys in @params
+        lambda do |extracted|
+          values = @params.each_with_object({}) do |param, hash|
+            hash[param] = extracted[param]
+          end
+          @klass.new(**values)
+        end
+      end
+      def implicit  # Struct subclasses receive all values positionally; other classes receive them as keywords
+        if @klass.ancestors.include?(Struct)
+          ->(extracted) { @klass.new(*extracted.values) }
+        else
+          ->(extracted) { @klass.new(**extracted) }
+        end
+      end
+    end # class FactoryAnalyzer
   end # class Record
 end # class TextExtractor

data/lib/text_extractor/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 class TextExtractor
   def self.version
-    '0.1.4'
+    '0.1.5'
   end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: text_extractor
 version: !ruby/object:Gem::Version
-  version: 0.1.4
+  version: 0.1.5
 platform: ruby
 authors:
 - Ben Miller
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-03-17 00:00:00.000000000 Z
+date: 2016-03-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -59,6 +59,9 @@ extensions: []
 extra_rdoc_files: []
 files:
 - lib/text_extractor.rb
+- lib/text_extractor/directives.rb
+- lib/text_extractor/directives/classes.rb
+- lib/text_extractor/directives/group.rb
 - lib/text_extractor/extraction.rb
 - lib/text_extractor/filldown.rb
 - lib/text_extractor/record.rb