text_extractor 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/text_extractor/extraction.rb +36 -0
- data/lib/text_extractor/filldown.rb +10 -0
- data/lib/text_extractor/record.rb +40 -0
- data/lib/text_extractor/value.rb +15 -0
- data/lib/text_extractor/version.rb +5 -0
- data/lib/text_extractor.rb +105 -0
- metadata +51 -0
@@ -0,0 +1,36 @@
|
|
1
|
+
class TextExtractor
|
2
|
+
# represents a single execution of a TextExtractor
|
3
|
+
class Extraction
|
4
|
+
attr_reader :input, :extractor, :re, :pos, :matches, :values
|
5
|
+
|
6
|
+
def initialize(input, extractor)
|
7
|
+
@input = input
|
8
|
+
@extractor = extractor
|
9
|
+
@pos = 0
|
10
|
+
@matches = []
|
11
|
+
@last_match = nil
|
12
|
+
end
|
13
|
+
|
14
|
+
def extraction_matches
|
15
|
+
@fill = {}
|
16
|
+
matches.flat_map do |match|
|
17
|
+
extraction_match(match)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def extraction_match(match)
|
22
|
+
extractor.find_record_for(match).extraction(match, @fill)
|
23
|
+
end
|
24
|
+
|
25
|
+
def scan
|
26
|
+
re = extractor.to_re
|
27
|
+
loop do
|
28
|
+
match = input.match(re, pos)
|
29
|
+
break unless match
|
30
|
+
@pos = match.end(0)
|
31
|
+
@matches << match
|
32
|
+
end
|
33
|
+
self
|
34
|
+
end
|
35
|
+
end # class Extraction
|
36
|
+
end # class TextExtractor
|
@@ -0,0 +1,40 @@
|
|
1
|
+
class TextExtractor
  # Pairs a regexp with the values it captures and turns a successful match
  # into either a plain hash or an instance of an optional factory class.
  class Record
    attr_reader :regexp, :factory, :values

    # @param regexp [Regexp] pattern for this record
    # @param factory [#new, nil] when given, extraction results are passed
    #   positionally to factory.new instead of being returned as a hash
    # @param values [Array<#id, #convert>] value definitions captured by regexp
    # @param fill [Object, Array] key(s) to copy from the shared fill hash
    def initialize(regexp, factory: nil, values: [], fill: [])
      @regexp = regexp
      @factory = factory
      @values = {}
      @default_values = {}
      values.each do |val|
        @values[val.id] = val
        @default_values[val.id] = nil
      end
      @fill = Array(fill)
    end

    # Builds the extracted data for one match. Later sources win on key
    # clashes: nil defaults first, then fill values, then converted captures.
    #
    # @param match [MatchData] match containing this record's named groups
    # @param fill [Hash] fill values shared across the current extraction
    # @return [Hash, Object] the hash itself, or factory.new(*hash.values)
    def extraction(match, fill)
      extracted = @default_values.dup
      extracted.update(extract_fills(fill))
      extracted.update(extract_values(match))
      return extracted unless factory

      factory.new(*extracted.values)
    end

    # Matches this record's regexp against +string+, starting at +pos+.
    def match(string, pos = 0)
      @regexp.match(string, pos)
    end

    # Source text of the underlying regexp.
    def source
      @regexp.source
    end

    # Option bits of the underlying regexp.
    def options
      @regexp.options
    end

    # Looks up each configured fill key in +fill+.
    #
    # @return [Hash] fill key => value found in +fill+ (nil when absent)
    def extract_fills(fill)
      looked_up = fill.values_at(*@fill)
      @fill.each_with_index.with_object({}) do |(key, idx), result|
        result[key] = looked_up[idx]
      end
    end

    # Runs every value's converter on the text captured under its id.
    #
    # @return [Hash] value id => converted capture
    def extract_values(match)
      values.each_with_object({}) do |(id, val), result|
        result[id] = val.convert(match[id])
      end
    end
  end # class Record
end # class TextExtractor
|
@@ -0,0 +1,105 @@
|
|
1
|
+
require_relative "text_extractor/extraction"
|
2
|
+
require_relative "text_extractor/filldown"
|
3
|
+
require_relative "text_extractor/record"
|
4
|
+
require_relative "text_extractor/value"
|
5
|
+
|
6
|
+
# Represents an extractor definition: a set of named values plus the records
# (regexps assembled from those values) used to pull data out of text.
#
# The definition is written as a block evaluated against the new instance,
# so #value, #integer, #record, etc. are called as bare DSL methods.
class TextExtractor
  # #ipaddr and #ipnetaddr build IPAddr objects; load the library here so
  # those converters do not raise NameError when the caller has not
  # required it.
  require "ipaddr"

  attr_reader :records, :values

  # @yield the definition block, run with instance_exec on the new extractor
  # @raise [RuntimeError] when no block is given
  def initialize(&block)
    raise "#{self.class}.new requires a block" unless block

    @values = {}
    @fill = {}
    @records = []
    @filldowns = []
    @current_record_values = []
    instance_exec(&block)
  end

  # Default patterns used by the typed value helpers below.
  module Patterns
    INTEGER = /\d+/
    # The fractional alternative comes first: with "\d+\.?" first, "3.14"
    # could be captured as just "3." whenever the rest of the record allowed
    # the match to stop there.
    FLOAT = /\d*\.\d+|\d+\.?/
    RATIONAL = %r(\d+/\d+)
    IPV4 = /[0-9.]{7,15}/
    IPV6 = /[:a-fA-F0-9\.]{2,45}/
    IPADDR = Regexp.union(IPV4, IPV6)
    IPV4_NET = /#{IPV4}\/\d{1,2}/
    IPV6_NET = /#{IPV6}\/\d{1,3}/
    IPNETADDR = Regexp.union(IPV4_NET, IPV6_NET)
    TRUE = /y|yes|t|true|on/i
    FALSE = /n|no|f|false|off/i
    BOOLEAN = Regexp.union(TRUE, FALSE)
  end

  # Patterns::FALSE anchored to the whole captured string. Unanchored, FALSE
  # matches any text that merely contains "n" or "f", so "on" would be
  # converted to false.
  FALSE_VALUE = /\A#{Patterns::FALSE}\z/

  # Declares a value and defines a singleton method named +id+ on this
  # extractor. Calling that method (normally via interpolation inside a
  # record block) registers the value with the record being defined and
  # returns a named-capture fragment wrapping +re+'s source.
  #
  # @param id [Symbol] value name; also the name of the capture group
  # @param re [Regexp] pattern for the value (only its source is embedded)
  # @yield [String] optional converter, passed on to Value.new
  def value(id, re, &block)
    val = @values[id] = Value.new(id, re, &block)
    define_singleton_method(id) do
      @current_record_values << val
      "(?<#{id}>#{re.source})"
    end
  end

  # Value converted to true unless the captured text is, in its entirety,
  # one of the Patterns::FALSE words (case-insensitive).
  def boolean(id, re = Patterns::BOOLEAN)
    value(id, re) { |val| val !~ FALSE_VALUE }
  end

  # Value converted with Kernel#Integer.
  def integer(id, re = Patterns::INTEGER)
    value(id, re) { |val| Integer(val) }
  end

  # Value converted with Kernel#Float. A trailing "." (which Patterns::FLOAT
  # permits, e.g. "3.") is dropped first because Float("3.") raises
  # ArgumentError on older Rubies.
  def float(id, re = Patterns::FLOAT)
    value(id, re) { |val| Float(val.sub(/\.\z/, "")) }
  end

  # Value converted with Kernel#Rational.
  def rational(id, re = Patterns::RATIONAL)
    value(id, re) { |val| Rational(val) }
  end

  # Value converted to an IPAddr.
  def ipaddr(id, re = Patterns::IPADDR)
    value(id, re) { |val| IPAddr.new(val) }
  end

  # Value with a "/prefix-length" suffix, converted to an IPAddr.
  def ipnetaddr(id, re = Patterns::IPNETADDR)
    value(id, re) { |val| IPAddr.new(val) }
  end

  # Removes the indentation introduced by writing a regexp literal across
  # several lines. When the last line of the source is whitespace only, it
  # is treated as the indentation prefix and removed from the START of each
  # line; whitespace inside a line is left alone. The joined result is then
  # stripped of surrounding whitespace.
  #
  # @param regexp [Regexp]
  # @return [Regexp] new regexp with the same options
  def strip_record(regexp)
    lines = regexp.source.lines
    prefix = lines.last
    if prefix && prefix =~ /\A\s*\z/
      leading = /\A#{Regexp.escape(prefix)}/
      lines.map! { |line| line.sub(leading, "") }
    end
    Regexp.new(lines.join.strip, regexp.options)
  end

  # Defines a record. The block must return a Regexp; every value method
  # called while the block runs is attached to the new record as :values.
  #
  # @param klass [Class] record class to instantiate (Record by default)
  # @param kwargs [Hash] extra keyword arguments passed on to klass.new
  # @raise [RuntimeError] when no block is given
  # @return [Array] the list of records defined so far
  def record(klass = Record, **kwargs, &block)
    raise "#{self.class}.record requires a block" unless block

    @current_record_values = []
    regexp = strip_record(instance_exec(&block))
    kwargs[:values] = @current_record_values
    @records << klass.new(regexp, **kwargs)
  end

  # Defines a record of class Filldown; otherwise identical to #record.
  #
  # @raise [RuntimeError] when no block is given
  def filldown(**kwargs, &block)
    raise "#{self.class}.filldown requires a block" unless block

    record(Filldown, **kwargs, &block)
  end

  # Finds the record whose wrapper group ("__0", "__1", ... as built by
  # #regexps) took part in +match+.
  #
  # @param match [MatchData] a match of #to_re
  # @return [Record, nil] the first such record, or nil when none took part
  def find_record_for(match)
    index = records.each_index.find { |i| match["__#{i}"] }
    index && records[index]
  end

  # Scans +input+ with every record and returns the extracted results.
  #
  # @param input [String]
  # @return [Array] the result of Extraction#extraction_matches
  def scan(input)
    Extraction.new(input, self).scan.extraction_matches
  end

  # One regexp per record, each wrapped in a named group "__<index>" so a
  # match of the combined regexp can be traced back to its record.
  #
  # @return [Array<Regexp>]
  def regexps
    @records.each_with_index.map do |record, i|
      Regexp.new("(?<__#{i}>#{record.source})", record.options)
    end
  end

  # @return [Regexp] union of all record regexps
  def to_re
    Regexp.union(*regexps)
  end
end # class TextExtractor
|
metadata
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: text_extractor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Ben Miller
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2015-10-20 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description:
|
15
|
+
email: bjmllr@gmail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/text_extractor.rb
|
21
|
+
- lib/text_extractor/version.rb
|
22
|
+
- lib/text_extractor/extraction.rb
|
23
|
+
- lib/text_extractor/value.rb
|
24
|
+
- lib/text_extractor/record.rb
|
25
|
+
- lib/text_extractor/filldown.rb
|
26
|
+
homepage: https://github.com/bjmllr/text_extractor
|
27
|
+
licenses:
|
28
|
+
- MIT
|
29
|
+
post_install_message:
|
30
|
+
rdoc_options: []
|
31
|
+
require_paths:
|
32
|
+
- lib
|
33
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
34
|
+
none: false
|
35
|
+
requirements:
|
36
|
+
- - ! '>='
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: 2.0.0
|
39
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
40
|
+
none: false
|
41
|
+
requirements:
|
42
|
+
- - ! '>='
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
version: '0'
|
45
|
+
requirements: []
|
46
|
+
rubyforge_project:
|
47
|
+
rubygems_version: 1.8.23
|
48
|
+
signing_key:
|
49
|
+
specification_version: 3
|
50
|
+
summary: Easily extract data from text
|
51
|
+
test_files: []
|