text_extractor 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,36 @@
1
class TextExtractor
  # Represents a single execution of a TextExtractor against one input.
  #
  # The extractor handed to #initialize must respond to +to_re+ (the pattern
  # to scan with) and +find_record_for+ (which maps a match to the record
  # whose +extraction+ method turns that match into results).
  class Extraction
    # +re+ holds the pattern used by the most recent #scan (nil before the
    # first scan). +values+ is never assigned by this class and reads as nil.
    attr_reader :input, :extractor, :re, :pos, :matches, :values

    # input:: the string to scan
    # extractor:: the object supplying +to_re+ and +find_record_for+
    def initialize(input, extractor)
      @input = input
      @extractor = extractor
      @pos = 0
      @matches = []
      @fill = {}
    end

    # Converts every collected match into extraction results.
    #
    # A fresh fill hash is started on each call and handed to every record in
    # match order. Results are flattened one level, so a record that returns
    # an empty array contributes nothing to the output.
    def extraction_matches
      @fill = {}
      matches.flat_map do |match|
        extraction_match(match)
      end
    end

    # Delegates a single match to the record the extractor selects for it.
    def extraction_match(match)
      extractor.find_record_for(match).extraction(match, @fill)
    end

    # Collects successive matches of the extractor's pattern, starting at the
    # current position, and returns self so calls can be chained.
    def scan
      @re = extractor.to_re
      loop do
        match = input.match(@re, pos)
        break unless match
        @matches << match
        finish = match.end(0)
        # A zero-width match ends where it began. Step one character past it,
        # otherwise the next search would find the same empty match forever.
        @pos = finish > match.begin(0) ? finish : finish + 1
        break if pos > input.length
      end
      self
    end
  end # class Extraction
end # class TextExtractor
@@ -0,0 +1,10 @@
1
+ require_relative "record"
2
+
3
class TextExtractor
  # A record that produces no output of its own: the values it captures are
  # stored in the shared fill hash instead.
  class Filldown < Record
    # Copies the values extracted from +match+ into +fill+ (mutating it) and
    # returns an empty array, so nothing is added to the caller's results.
    def extraction(match, fill)
      captured = extract_values(match)
      fill.update(captured)
      []
    end
  end # class Filldown < Record
end # class TextExtractor
@@ -0,0 +1,40 @@
1
class TextExtractor
  # Couples a regexp with the values it captures and turns a successful match
  # into either a Hash or an instance of the configured factory.
  class Record
    attr_reader :regexp, :factory, :values

    # regexp:: pattern whose named groups are looked up by each value's id
    # factory:: optional class; when set, #extraction returns factory.new(...)
    # values:: objects responding to +id+ and +convert+
    # fill:: key or keys copied from the fill hash into every extraction
    def initialize(regexp, factory: nil, values: [], fill: [])
      @regexp = regexp
      @factory = factory
      @values = values.each_with_object({}) do |value, by_id|
        by_id[value.id] = value
      end
      @default_values = values.each_with_object({}) do |value, blanks|
        blanks[value.id] = nil
      end
      @fill = Array(fill)
    end

    # Builds the result for +match+. Later sources win on key collisions:
    # nil defaults first, then fill values, then values converted from the
    # match itself.
    def extraction(match, fill)
      extracted = {}
      extracted.update(@default_values)
      extracted.update(extract_fills(fill))
      extracted.update(extract_values(match))
      return extracted unless factory
      factory.new(*extracted.values)
    end

    # Matches +string+ against the record's regexp, starting at +pos+.
    def match(string, pos = 0)
      regexp.match(string, pos)
    end

    # Source text of the record's regexp.
    def source
      regexp.source
    end

    # Option bits of the record's regexp.
    def options
      regexp.options
    end

    # Picks this record's fill keys out of +fill+, in declaration order.
    def extract_fills(fill)
      wanted = fill.values_at(*@fill)
      @fill.zip(wanted).to_h
    end

    # Converts the named capture of every value of this record.
    def extract_values(match)
      values.each_with_object({}) do |(id, value), converted|
        converted[id] = value.convert(match[id])
      end
    end
  end # class Record
end # class TextExtractor
@@ -0,0 +1,15 @@
1
class TextExtractor
  # A named capture: an id, the pattern that matches it, and an optional
  # block that converts the captured text.
  class Value
    attr_reader :id, :re

    # id:: name of the value
    # re:: pattern associated with the value
    # block:: optional converter applied by #convert
    def initialize(id, re, &block)
      @id = id
      @re = re
      @block = block if block
    end

    # Returns +value+ unchanged when no converter was given. Otherwise calls
    # the converter with +value+ splatted, so an array is spread across the
    # block's parameters and nil yields a call with no arguments.
    def convert(value)
      return value unless @block
      @block.call(*value)
    end
  end
end
@@ -0,0 +1,5 @@
1
class TextExtractor
  class << self
    # Version string of this release of the gem.
    def version
      "0.0.2"
    end
  end
end
@@ -0,0 +1,105 @@
1
require "ipaddr"

require_relative "text_extractor/extraction"
require_relative "text_extractor/filldown"
require_relative "text_extractor/record"
require_relative "text_extractor/value"
5
+
6
# Represents an extractor definition.
#
# The block given to .new is evaluated with instance_exec, so it can call
# #value, #integer, #record, #filldown and the other definition methods
# directly.
class TextExtractor
  attr_reader :records, :values

  # Raises a RuntimeError when no block is given.
  def initialize(&block)
    fail "#{self.class}.new requires a block" unless block
    @values = {}
    @fill = {}
    @records = []
    @filldowns = []
    @current_record_values = []
    instance_exec(&block)
  end

  # Default patterns for the typed value helpers. The address patterns only
  # check which characters appear and how many; they do not validate.
  module Patterns
    INTEGER = /\d+/
    # The fractional alternative comes first: with `\d+\.?` leading, "1.5"
    # would be captured as "1.", which Float() rejects.
    FLOAT = /\d*\.\d+|\d+\.?/
    RATIONAL = %r(\d+/\d+)
    IPV4 = /[0-9.]{7,15}/
    IPV6 = /[:a-fA-F0-9\.]{2,45}/
    IPADDR = Regexp.union(IPV4, IPV6)
    IPV4_NET = /#{IPV4}\/\d{1,2}/
    IPV6_NET = /#{IPV6}\/\d{1,3}/
    IPNETADDR = Regexp.union(IPV4_NET, IPV6_NET)
    TRUE = /y|yes|t|true|on/i
    FALSE = /n|no|f|false|off/i
    BOOLEAN = Regexp.union(TRUE, FALSE)
  end

  # Patterns::FALSE anchored to the whole string. Unanchored, its "n"
  # alternative also matches inside "on", turning a true word into false.
  WHOLE_FALSE = /\A#{Patterns::FALSE}\z/
  private_constant :WHOLE_FALSE

  # Registers a value and defines a singleton method named +id+. Calling that
  # method (normally by interpolating it into a record's regexp) adds the
  # value to the record being defined and returns the source of a named
  # group wrapping +re+.
  #
  # id:: name of the value and of the generated method
  # re:: pattern matching the value's text
  # block:: optional converter for the captured text
  def value(id, re, &block)
    val = @values[id] = Value.new(id, re, &block)
    define_singleton_method(id) do
      @current_record_values << val
      "(?<#{id}>#{re.source})"
    end
  end

  # Value converted to false when the capture is exactly one of the words in
  # Patterns::FALSE (case-insensitive), and to true otherwise.
  def boolean(id, re = Patterns::BOOLEAN)
    value(id, re) { |val| !val.match(WHOLE_FALSE) }
  end

  # Value converted with Kernel#Integer.
  def integer(id, re = Patterns::INTEGER)
    value(id, re) { |val| Integer(val) }
  end

  # Value converted with Kernel#Float.
  def float(id, re = Patterns::FLOAT)
    value(id, re) { |val| Float(val) }
  end

  # Value converted with Kernel#Rational.
  def rational(id, re = Patterns::RATIONAL)
    value(id, re) { |val| Rational(val) }
  end

  # Value converted to an IPAddr.
  def ipaddr(id, re = Patterns::IPADDR)
    value(id, re) { |val| IPAddr.new(val) }
  end

  # Value in address/prefix-length form, converted to an IPAddr.
  def ipnetaddr(id, re = Patterns::IPNETADDR)
    value(id, re) { |val| IPAddr.new(val) }
  end

  # Removes the common indentation from a multi-line regexp.
  #
  # When the last line of the source is whitespace only, it is treated as
  # the indent and removed from the start of every line that begins with it.
  # Only the leading occurrence is removed, so deeper indentation and runs of
  # spaces inside a line are kept. The result is stripped of surrounding
  # whitespace and keeps the original options.
  def strip_record(regexp)
    lines = regexp.source.lines
    prefix = lines.last
    if prefix =~ /\A\s*\z/
      lines.map! do |line|
        line.start_with?(prefix) ? line[prefix.length..-1] : line
      end
    end
    Regexp.new(lines.join.strip, regexp.options)
  end

  # Defines a record. The block is evaluated in the context of the extractor
  # and must return a Regexp; the values referenced while it runs become the
  # record's values. Remaining keyword arguments are passed to klass.new.
  # Raises a RuntimeError when no block is given.
  def record(klass = Record, **kwargs, &block)
    fail "#{self.class}.record requires a block" unless block
    @current_record_values = []
    regexp = strip_record(instance_exec(&block))
    kwargs[:values] = @current_record_values
    @records << klass.new(regexp, **kwargs)
  end

  # Defines a record of class Filldown. Raises a RuntimeError when no block
  # is given.
  def filldown(**kwargs, &block)
    fail "#{self.class}.filldown requires a block" unless block
    record(Filldown, **kwargs, &block)
  end

  # Returns the first record whose marker group (see #regexps) took part in
  # +match+.
  def find_record_for(match)
    records[records.each_index.find { |i| match["__#{i}"] }]
  end

  # Scans +input+ with every record and returns the extraction results.
  def scan(input)
    Extraction.new(input, self).scan.extraction_matches
  end

  # One regexp per record, each wrapped in a named group "__<index>" so the
  # record that produced a match can be identified afterwards.
  def regexps
    @records.map.with_index do |record, i|
      Regexp.new("(?<__#{i}>#{record.source})", record.options)
    end
  end

  # A single regexp matching any of the records.
  def to_re
    Regexp.union(*regexps)
  end
end # class TextExtractor
metadata ADDED
@@ -0,0 +1,51 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: text_extractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Ben Miller
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2015-10-20 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description:
15
+ email: bjmllr@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/text_extractor.rb
21
+ - lib/text_extractor/version.rb
22
+ - lib/text_extractor/extraction.rb
23
+ - lib/text_extractor/value.rb
24
+ - lib/text_extractor/record.rb
25
+ - lib/text_extractor/filldown.rb
26
+ homepage: https://github.com/bjmllr/text_extractor
27
+ licenses:
28
+ - MIT
29
+ post_install_message:
30
+ rdoc_options: []
31
+ require_paths:
32
+ - lib
33
+ required_ruby_version: !ruby/object:Gem::Requirement
34
+ none: false
35
+ requirements:
36
+ - - ! '>='
37
+ - !ruby/object:Gem::Version
38
+ version: 2.0.0
39
+ required_rubygems_version: !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ! '>='
43
+ - !ruby/object:Gem::Version
44
+ version: '0'
45
+ requirements: []
46
+ rubyforge_project:
47
+ rubygems_version: 1.8.23
48
+ signing_key:
49
+ specification_version: 3
50
+ summary: Easily extract data from text
51
+ test_files: []