text_extractor 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,36 @@
1
class TextExtractor
  # Represents a single execution of a TextExtractor against one input:
  # #scan collects the raw regexp matches, #extraction_matches turns them
  # into whatever the matching records produce.
  class Extraction
    attr_reader :input, :extractor, :re, :pos, :matches, :values

    # input     - the String to be scanned.
    # extractor - the object that supplies the combined regexp (#to_re) and
    #             maps a match back to its record (#find_record_for).
    def initialize(input, extractor)
      @input = input
      @extractor = extractor
      @pos = 0
      @matches = []
      @last_match = nil
      # Exposed through attr_reader; initialised so the readers never touch
      # an undefined instance variable.
      @re = nil
      @values = nil
    end

    # Converts every collected match into its extraction and returns them as
    # one flat Array. A fresh fill hash is shared by all matches of this
    # call, in match order.
    def extraction_matches
      @fill = {}
      matches.flat_map { |match| extraction_match(match) }
    end

    # Looks up the record responsible for +match+ and asks it to build the
    # extraction, handing over the shared fill hash.
    def extraction_match(match)
      record = extractor.find_record_for(match)
      record.extraction(match, @fill)
    end

    # Repeatedly matches the extractor's regexp against the input, starting
    # each search where the previous match ended, and stores every match in
    # #matches. Returns self so calls can be chained.
    def scan
      @re = extractor.to_re
      loop do
        # Only reachable after stepping over a zero-width match at the very
        # end of the input.
        break if pos > input.length

        match = input.match(re, pos)
        break unless match

        @matches << match
        @pos = match.end(0)
        # A zero-width match does not consume anything; without this step
        # the next search would find the same empty match forever.
        @pos += 1 if match.begin(0) == match.end(0)
      end
      self
    end
  end # class Extraction
end # class TextExtractor
@@ -0,0 +1,10 @@
1
+ require_relative "record"
2
+
3
class TextExtractor
  # A record that emits nothing itself: the values it captures are stored in
  # the fill hash it is given, and an empty list is returned.
  class Filldown < Record
    # match - the MatchData to read this record's values from.
    # fill  - the Hash to update in place with the captured values.
    #
    # Returns an empty Array.
    def extraction(match, fill)
      captured = extract_values(match)
      fill.update(captured)
      []
    end
  end # class Filldown < Record
end # class TextExtractor
@@ -0,0 +1,40 @@
1
class TextExtractor
  # Pairs a regexp with the values it captures and knows how to turn a match
  # into a Hash of those values (or into an object built by a factory).
  class Record
    attr_reader :regexp, :factory, :values

    # regexp  - the Regexp for this record.
    # factory - optional class; when given, extractions are built with
    #           factory.new(*values) instead of being returned as a Hash.
    # values  - objects responding to #id and #convert, one per capture.
    # fill    - key (or list of keys) to copy from the fill hash into every
    #           extraction.
    def initialize(regexp, factory: nil, values: [], fill: [])
      @regexp = regexp
      @factory = factory
      # Built with each_with_object rather than Array#to_h so the code also
      # runs on Ruby 2.0, which lacks Array#to_h.
      @values = values.each_with_object({}) { |val, by_id| by_id[val.id] = val }
      @default_values = values.each_with_object({}) do |val, defaults|
        defaults[val.id] = nil
      end
      @fill = Array(fill)
    end

    # Builds the extraction for +match+. Keys are laid down in this order:
    # every value id (defaulting to nil), then the requested fill keys, then
    # the converted captures. The order matters when a factory is used,
    # because the values are passed to it positionally.
    def extraction(match, fill)
      extracted = @default_values
                  .merge(extract_fills(fill))
                  .merge(extract_values(match))
      factory ? factory.new(*extracted.values) : extracted
    end

    # Matches this record's regexp against +string+, starting at +pos+.
    def match(string, pos = 0)
      @regexp.match(string, pos)
    end

    # The source text of the regexp.
    def source
      @regexp.source
    end

    # The option flags of the regexp.
    def options
      @regexp.options
    end

    # Picks the configured fill keys out of +fill+; keys that are absent
    # come back with whatever fill[key] yields (nil for a plain Hash).
    def extract_fills(fill)
      @fill.each_with_object({}) { |key, picked| picked[key] = fill[key] }
    end

    # Reads each value's named capture from +match+ and runs it through the
    # value's converter.
    def extract_values(match)
      values.each_with_object({}) do |(id, val), converted|
        converted[id] = val.convert(match[id])
      end
    end
  end # class Record
end # class TextExtractor
@@ -0,0 +1,15 @@
1
class TextExtractor
  # A named capture: an id, the regexp that recognises it, and an optional
  # block that converts the captured text.
  class Value
    attr_reader :id, :re

    # id    - the name of the capture.
    # re    - the Regexp that recognises the value.
    # block - optional converter applied by #convert.
    def initialize(id, re, &block)
      @id = id
      @re = re
      # Always assigned (nil when no block is given) so #convert never reads
      # an undefined instance variable.
      @block = block
    end

    # Returns +value+ unchanged when no converter was given; otherwise the
    # converter's result. The value is splatted into the call, so a nil
    # value reaches the block as "no arguments" and an Array is spread over
    # the block's parameters.
    def convert(value)
      return value unless @block

      @block.call(*value)
    end
  end
end
@@ -0,0 +1,5 @@
1
class TextExtractor
  class << self
    # Returns the version of this library as a String.
    def version
      "0.0.2"
    end
  end
end
@@ -0,0 +1,105 @@
1
require "ipaddr"

require_relative "text_extractor/extraction"
require_relative "text_extractor/filldown"
require_relative "text_extractor/record"
require_relative "text_extractor/value"
5
+
6
# Represents an extractor definition: a set of named values plus the records
# (regexps) built from them. The definition is written as a block that is
# evaluated in the context of the new instance.
class TextExtractor
  attr_reader :records, :values

  # Default patterns used by the typed value helpers below.
  module Patterns
    INTEGER = /\d+/
    # The fractional form comes first: alternation is tried left to right,
    # so with `\d+\.?` in front "3.14" would be captured as just "3.".
    FLOAT = /\d*\.\d+|\d+\.?/
    RATIONAL = %r(\d+/\d+)
    IPV4 = /[0-9.]{7,15}/
    IPV6 = /[:a-fA-F0-9\.]{2,45}/
    IPADDR = Regexp.union(IPV4, IPV6)
    IPV4_NET = /#{IPV4}\/\d{1,2}/
    IPV6_NET = /#{IPV6}\/\d{1,3}/
    IPNETADDR = Regexp.union(IPV4_NET, IPV6_NET)
    # Whole words are listed before their one-letter abbreviations so that
    # "yes" is captured as "yes" rather than "y" (same for the others).
    TRUE = /yes|y|true|t|on/i
    FALSE = /no|n|false|f|off/i
    BOOLEAN = Regexp.union(TRUE, FALSE)
  end

  # Evaluates the given block on the new instance.
  #
  # Raises RuntimeError when no block is given.
  def initialize(&block)
    raise "#{self.class}.new requires a block" unless block

    @values = {}
    # @fill and @filldowns are not read anywhere in this class body; they
    # are kept in case code elsewhere relies on them.
    @fill = {}
    @records = []
    @filldowns = []
    @current_record_values = []
    instance_exec(&block)
  end

  # Declares a value named +id+ recognised by +re+, optionally converted by
  # the block. Also defines a singleton method named +id+ that returns a
  # named capture group for interpolation into a record regexp and notes
  # that the record being defined uses this value.
  #
  # Only re.source is embedded in the group; option flags set on +re+ itself
  # are not carried over.
  def value(id, re, &block)
    val = @values[id] = Value.new(id, re, &block)
    define_singleton_method(id) do
      @current_record_values << val
      "(?<#{id}>#{re.source})"
    end
  end

  # Declares a value converted to true/false. The result is false only when
  # the whole captured text is one of the FALSE words; the pattern is
  # anchored because an unanchored test treats any text containing "n" or
  # "f" (for example "on") as false.
  def boolean(id, re = Patterns::BOOLEAN)
    falsy = /\A(?:#{Patterns::FALSE})\z/
    value(id, re) { |val| !val.match(falsy) }
  end

  # Declares a value converted with Integer().
  def integer(id, re = Patterns::INTEGER)
    value(id, re) { |val| Integer(val) }
  end

  # Declares a value converted with Float(). The default pattern admits a
  # bare trailing dot ("3."), which Float() rejects on Ruby versions before
  # 3.4, so a zero is appended in that case.
  def float(id, re = Patterns::FLOAT)
    value(id, re) do |val|
      text = val.to_s.end_with?(".") ? "#{val}0" : val
      Float(text)
    end
  end

  # Declares a value converted with Rational().
  def rational(id, re = Patterns::RATIONAL)
    value(id, re) { |val| Rational(val) }
  end

  # Declares a value converted to an IPAddr.
  def ipaddr(id, re = Patterns::IPADDR)
    value(id, re) { |val| IPAddr.new(val) }
  end

  # Declares an address-with-prefix value converted to an IPAddr.
  def ipnetaddr(id, re = Patterns::IPNETADDR)
    value(id, re) { |val| IPAddr.new(val) }
  end

  # Removes the common indentation from a multi-line regexp literal. When
  # the last line of the source is blank, it is taken as the indentation and
  # stripped from the START of every line that begins with it; whitespace
  # inside a line is significant in the pattern and is left alone. The
  # result is then stripped of surrounding whitespace and recompiled with
  # the original options.
  def strip_record(regexp)
    lines = regexp.source.lines
    indent = lines.last
    if indent && indent =~ /\A\s*\z/
      lines = lines.map do |line|
        line.start_with?(indent) ? line[indent.length..-1] : line
      end
    end
    Regexp.new(lines.join.strip, regexp.options)
  end

  # Defines a record. The block is evaluated on this extractor and must
  # return a Regexp; every value method it calls is recorded and passed to
  # the record as its values. Remaining keyword arguments go to klass.new.
  #
  # Raises RuntimeError when no block is given.
  def record(klass = Record, **kwargs, &block)
    raise "#{self.class}.record requires a block" unless block

    @current_record_values = []
    regexp = strip_record(instance_exec(&block))
    kwargs[:values] = @current_record_values
    @records << klass.new(regexp, **kwargs)
  end

  # Defines a Filldown record; see #record.
  #
  # Raises RuntimeError when no block is given.
  def filldown(**kwargs, &block)
    raise "#{self.class}.filldown requires a block" unless block

    record(Filldown, **kwargs, &block)
  end

  # Returns the record whose wrapper group (see #regexps) took part in
  # +match+, or nil when none did.
  def find_record_for(match)
    index = records.each_index.find { |i| match["__#{i}"] }
    index && records[index]
  end

  # Scans +input+ with all records and returns the list of extractions.
  def scan(input)
    Extraction.new(input, self).scan.extraction_matches
  end

  # One regexp per record, each wrapped in a group named "__<index>" so a
  # match of the combined regexp can be traced back to its record.
  def regexps
    @records.each_with_index.map do |record, i|
      Regexp.new("(?<__#{i}>#{record.source})", record.options)
    end
  end

  # The union of all record regexps.
  def to_re
    Regexp.union(*regexps)
  end
end # class TextExtractor
metadata ADDED
@@ -0,0 +1,51 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: text_extractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Ben Miller
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2015-10-20 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description:
15
+ email: bjmllr@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/text_extractor.rb
21
+ - lib/text_extractor/version.rb
22
+ - lib/text_extractor/extraction.rb
23
+ - lib/text_extractor/value.rb
24
+ - lib/text_extractor/record.rb
25
+ - lib/text_extractor/filldown.rb
26
+ homepage: https://github.com/bjmllr/text_extractor
27
+ licenses:
28
+ - MIT
29
+ post_install_message:
30
+ rdoc_options: []
31
+ require_paths:
32
+ - lib
33
+ required_ruby_version: !ruby/object:Gem::Requirement
34
+ none: false
35
+ requirements:
36
+ - - ! '>='
37
+ - !ruby/object:Gem::Version
38
+ version: 2.0.0
39
+ required_rubygems_version: !ruby/object:Gem::Requirement
40
+ none: false
41
+ requirements:
42
+ - - ! '>='
43
+ - !ruby/object:Gem::Version
44
+ version: '0'
45
+ requirements: []
46
+ rubyforge_project:
47
+ rubygems_version: 1.8.23
48
+ signing_key:
49
+ specification_version: 3
50
+ summary: Easily extract data from text
51
+ test_files: []