RubyGems - text_extractor - Versions diffs - 0.0.2 - Mend

text_extractor 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

data/lib/text_extractor/extraction.rb +36 -0
data/lib/text_extractor/filldown.rb +10 -0
data/lib/text_extractor/record.rb +40 -0
data/lib/text_extractor/value.rb +15 -0
data/lib/text_extractor/version.rb +5 -0
data/lib/text_extractor.rb +105 -0
metadata +51 -0

data/lib/text_extractor/extraction.rb ADDED Viewed

@@ -0,0 +1,36 @@
+class TextExtractor
+  # represents a single execution of a TextExtractor
+  class Extraction
+    attr_reader :input, :extractor, :re, :pos, :matches, :values
+    def initialize(input, extractor)
+      @input = input
+      @extractor = extractor
+      @pos = 0
+      @matches = []
+      @last_match = nil
+    end
+    def extraction_matches
+      @fill = {}
+      matches.flat_map do |match|
+        extraction_match(match)
+      end
+    end
+    def extraction_match(match)
+      extractor.find_record_for(match).extraction(match, @fill)
+    end
+    def scan
+      re = extractor.to_re
+      loop do
+        match = input.match(re, pos)
+        break unless match
+        @pos = match.end(0)
+        @matches << match
+      end
+      self
+    end
+  end # class Extraction
+end # class TextExtractor

data/lib/text_extractor/filldown.rb ADDED Viewed

@@ -0,0 +1,10 @@
require_relative "record"

class TextExtractor
  # A Record whose captures are not emitted as a result of their own.
  # Instead they are written into the fill hash it is given, where records
  # configured with matching fill keys can read them.
  class Filldown < Record
    # Merges this match's converted values into +fill+ (mutating it) and
    # returns an empty Array.
    def extraction(match, fill)
      captured = extract_values(match)
      fill.update(captured)
      []
    end
  end # class Filldown < Record
end # class TextExtractor

data/lib/text_extractor/record.rb ADDED Viewed

@@ -0,0 +1,40 @@
class TextExtractor
  # Pairs a regexp with the values it captures and knows how to turn a
  # MatchData for that regexp into an extracted result.
  class Record
    attr_reader :regexp, :factory, :values

    # regexp  - the Regexp for this record.
    # factory - optional; when given, results are built with
    #           factory.new(*extracted_values) instead of returned as a Hash.
    # values  - objects responding to #id and #convert; stored keyed by id.
    # fill    - a key, or list of keys, to copy out of the fill hash.
    def initialize(regexp, factory: nil, values: [], fill: [])
      @regexp = regexp
      @factory = factory
      @values = {}
      @default_values = {}
      values.each do |value|
        @values[value.id] = value
        @default_values[value.id] = nil
      end
      @fill = Array(fill)
    end

    # Builds the result for one match. Later layers override earlier ones:
    # nil defaults for every value id, then the requested fill keys, then
    # the values converted from the match itself.
    def extraction(match, fill)
      extracted = @default_values.merge(extract_fills(fill))
      extracted = extracted.merge(extract_values(match))
      return extracted unless factory

      factory.new(*extracted.values)
    end

    # Matches this record's regexp against +string+ starting at +pos+.
    def match(string, pos = 0)
      regexp.match(string, pos)
    end

    # Source text of the record's regexp.
    def source
      regexp.source
    end

    # Option bits of the record's regexp.
    def options
      regexp.options
    end

    # Returns a Hash holding each configured fill key with the value
    # +fill+ has for it.
    def extract_fills(fill)
      found = fill.values_at(*@fill)
      Hash[@fill.zip(found)]
    end

    # Returns a Hash of value id => converted capture, reading each capture
    # from +match+ by the value's id.
    def extract_values(match)
      values.each_with_object({}) do |(id, value), converted|
        converted[id] = value.convert(match[id])
      end
    end
  end # class Record
end # class TextExtractor

data/lib/text_extractor/value.rb ADDED Viewed

@@ -0,0 +1,15 @@
class TextExtractor
  # A named capture: an id, the regexp that recognises it, and an optional
  # block used to convert the captured text.
  class Value
    attr_reader :id, :re

    # id    - name of the capture.
    # re    - Regexp describing what the capture looks like.
    # block - optional converter applied by #convert.
    def initialize(id, re, &block)
      @id = id
      @re = re
      @block = block if block
    end

    # Returns +value+ unchanged when no converter block was given, otherwise
    # the block's result. The value is splatted into the call: a String is
    # passed as one argument, an Array is spread over the block's
    # parameters, and nil passes no arguments at all.
    def convert(value)
      return value unless @block

      @block.call(*value)
    end
  end
end

data/lib/text_extractor/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
class TextExtractor
  class << self
    # Returns the version of this gem as a String.
    def version
      "0.0.2"
    end
  end
end

data/lib/text_extractor.rb ADDED Viewed

@@ -0,0 +1,105 @@
require "ipaddr"

require_relative "text_extractor/extraction"
require_relative "text_extractor/filldown"
require_relative "text_extractor/record"
require_relative "text_extractor/value"
# Represents an extractor definition: a set of typed values plus the records
# (regexps) that capture them, declared through a block evaluated on the new
# instance.
#
#   extractor = TextExtractor.new do
#     integer :port
#     record { /port=#{port}/ }
#   end
#   extractor.scan("port=80 port=443") # => [{ port: 80 }, { port: 443 }]
class TextExtractor
  attr_reader :records, :values

  # Evaluates +block+ in the context of the new extractor so that it can call
  # #value, #integer, #record and friends directly.
  #
  # Raises RuntimeError when no block is given.
  def initialize(&block)
    raise "#{self.class}.new requires a block" unless block
    @values = {}
    @fill = {}
    @records = []
    @filldowns = []
    @current_record_values = []
    instance_exec(&block)
  end

  # Default patterns used by the typed value helpers below.
  module Patterns
    INTEGER = /\d+/
    # The decimal alternative comes first so that "1.5" is captured whole
    # instead of stopping at "1." when nothing follows the group.
    FLOAT = /\d*\.\d+|\d+\.?/
    RATIONAL = %r(\d+/\d+)
    IPV4 = /[0-9.]{7,15}/
    IPV6 = /[:a-fA-F0-9\.]{2,45}/
    IPADDR = Regexp.union(IPV4, IPV6)
    IPV4_NET = /#{IPV4}\/\d{1,2}/
    IPV6_NET = /#{IPV6}\/\d{1,3}/
    IPNETADDR = Regexp.union(IPV4_NET, IPV6_NET)
    TRUE = /y|yes|t|true|on/i
    FALSE = /n|no|f|false|off/i
    BOOLEAN = Regexp.union(TRUE, FALSE)
  end

  # Declares a value named +id+ that is recognised by +re+ and optionally
  # converted by +block+.
  #
  # Besides storing the Value in #values, this defines a singleton method
  # named +id+ on the extractor. Calling that method registers the value with
  # the record currently being defined and returns the source of a named
  # capture group, ready to be interpolated into the record's regexp.
  def value(id, re, &block)
    val = @values[id] = Value.new(id, re, &block)
    define_singleton_method(id) do
      @current_record_values << val
      "(?<#{id}>#{re.source})"
    end
  end

  # Declares a value converted to true or false. The capture is false only
  # when the whole captured text is one of the Patterns::FALSE words; the
  # comparison is anchored so that "on" is not mistaken for false merely
  # because it contains an "n".
  def boolean(id, re = Patterns::BOOLEAN)
    exactly_false = /\A#{Patterns::FALSE}\z/
    value(id, re) { |val| !val.match(exactly_false) }
  end

  # Declares a value converted with Kernel#Integer.
  def integer(id, re = Patterns::INTEGER)
    value(id, re) { |val| Integer(val) }
  end

  # Declares a value converted with Kernel#Float.
  def float(id, re = Patterns::FLOAT)
    value(id, re) { |val| Float(val) }
  end

  # Declares a value converted with Kernel#Rational.
  def rational(id, re = Patterns::RATIONAL)
    value(id, re) { |val| Rational(val) }
  end

  # Declares a value converted to an IPAddr.
  def ipaddr(id, re = Patterns::IPADDR)
    value(id, re) { |val| IPAddr.new(val) }
  end

  # Declares an address-with-prefix-length value converted to an IPAddr.
  def ipnetaddr(id, re = Patterns::IPNETADDR)
    value(id, re) { |val| IPAddr.new(val) }
  end

  # Returns a copy of +regexp+ with surrounding whitespace removed from its
  # source. When the last line of the source is whitespace only, that line is
  # treated as the common indentation and removed from the start of every
  # line that begins with it; text further into a line is left alone.
  def strip_record(regexp)
    lines = regexp.source.lines
    prefix = lines.last
    if prefix =~ /\A\s*\z/
      lines.map! do |line|
        line.start_with?(prefix) ? line[prefix.length..-1] : line
      end
    end
    Regexp.new(lines.join.strip, regexp.options)
  end

  # Defines a record. The block is evaluated on the extractor and must return
  # a Regexp; every value method called while building it becomes one of the
  # record's values (any :values passed in +kwargs+ is overwritten). The
  # remaining +kwargs+ are forwarded to +klass+.new.
  #
  # Raises RuntimeError when no block is given.
  def record(klass = Record, **kwargs, &block)
    raise "#{self.class}.record requires a block" unless block
    @current_record_values = []
    regexp = strip_record(instance_exec(&block))
    kwargs[:values] = @current_record_values
    @records << klass.new(regexp, **kwargs)
  end

  # Defines a record of class Filldown; otherwise identical to #record.
  #
  # Raises RuntimeError when no block is given.
  def filldown(**kwargs, &block)
    raise "#{self.class}.filldown requires a block" unless block
    record(Filldown, **kwargs, &block)
  end

  # Returns the record whose wrapper group (see #regexps) took part in
  # +match+, checking the records in definition order.
  def find_record_for(match)
    index = records.each_index.find { |i| match["__#{i}"] }
    records[index]
  end

  # Runs the extractor over +input+ and returns the extracted results.
  def scan(input)
    Extraction.new(input, self).scan.extraction_matches
  end

  # Returns one Regexp per record, each wrapped in a named group "__<index>"
  # so that a match of the combined regexp can be traced back to its record.
  def regexps
    @records.each_with_index.map do |record, i|
      Regexp.new("(?<__#{i}>#{record.source})", record.options)
    end
  end

  # Returns a single Regexp matching any of the records.
  def to_re
    Regexp.union(*regexps)
  end
end # class TextExtractor

metadata ADDED Viewed

@@ -0,0 +1,51 @@
+--- !ruby/object:Gem::Specification
+name: text_extractor
+version: !ruby/object:Gem::Version
+  version: 0.0.2
+  prerelease:
+platform: ruby
+authors:
+- Ben Miller
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2015-10-20 00:00:00.000000000 Z
+dependencies: []
+description:
+email: bjmllr@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/text_extractor.rb
+- lib/text_extractor/version.rb
+- lib/text_extractor/extraction.rb
+- lib/text_extractor/value.rb
+- lib/text_extractor/record.rb
+- lib/text_extractor/filldown.rb
+homepage: https://github.com/bjmllr/text_extractor
+licenses:
+- MIT
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: 2.0.0
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.23
+signing_key:
+specification_version: 3
+summary: Easily extract data from text
+test_files: []