textutils 1.3.1 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Manifest.txt +63 -61
- data/Rakefile +1 -1
- data/lib/textutils.rb +1 -0
- data/lib/textutils/title_mapper2.rb +168 -0
- data/lib/textutils/version.rb +2 -2
- data/test/test_title_mapper2.rb +45 -0
- metadata +10 -23
- data/.gemtest +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0db02f95a1da720a3778b0e1dc3636f7318e6fa6
|
4
|
+
data.tar.gz: c3420d50bc0c25e3d7e6ced6b9959c01770d471b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 690e28c6bb15160be57234c3092ef2db54eabfa5d7b66f6ed58dbcd6d5b876451654833c9c0334d55921571fb8498942ae02e8475d437dbd3944ba78afd340ab
|
7
|
+
data.tar.gz: 7348393e33bd3813c55c5b1360ec524836aa1cffb545e53597b599d89f009505c64560df2f705fac44c687644cfcc404aa2a653de8fdc8c6b054f0ae192c144f
|
data/Manifest.txt
CHANGED
@@ -1,61 +1,63 @@
|
|
1
|
-
HISTORY.md
|
2
|
-
Manifest.txt
|
3
|
-
README.md
|
4
|
-
Rakefile
|
5
|
-
lib/textutils.rb
|
6
|
-
lib/textutils/classifier.rb
|
7
|
-
lib/textutils/core_ext/array.rb
|
8
|
-
lib/textutils/core_ext/file.rb
|
9
|
-
lib/textutils/core_ext/time.rb
|
10
|
-
lib/textutils/filter/code_filter.rb
|
11
|
-
lib/textutils/filter/comment_filter.rb
|
12
|
-
lib/textutils/filter/erb_django_filter.rb
|
13
|
-
lib/textutils/filter/erb_filter.rb
|
14
|
-
lib/textutils/filter/string_filter.rb
|
15
|
-
lib/textutils/helper/address_helper.rb
|
16
|
-
lib/textutils/helper/date_helper.rb
|
17
|
-
lib/textutils/helper/hypertext_helper.rb
|
18
|
-
lib/textutils/helper/tag_helper.rb
|
19
|
-
lib/textutils/helper/title_helper.rb
|
20
|
-
lib/textutils/helper/unicode_helper.rb
|
21
|
-
lib/textutils/helper/value_helper_i.rb
|
22
|
-
lib/textutils/helper/value_helper_ii.rb
|
23
|
-
lib/textutils/helper/value_helper_iii_numbers.rb
|
24
|
-
lib/textutils/helper/xml_helper.rb
|
25
|
-
lib/textutils/page.rb
|
26
|
-
lib/textutils/parser/name_parser.rb
|
27
|
-
lib/textutils/parser/name_tokenizer.rb
|
28
|
-
lib/textutils/patterns.rb
|
29
|
-
lib/textutils/reader/block_reader.rb
|
30
|
-
lib/textutils/reader/code_reader.rb
|
31
|
-
lib/textutils/reader/fixture_reader.rb
|
32
|
-
lib/textutils/reader/hash_reader.rb
|
33
|
-
lib/textutils/reader/line_reader.rb
|
34
|
-
lib/textutils/reader/tree_reader.rb
|
35
|
-
lib/textutils/reader/values_reader.rb
|
36
|
-
lib/textutils/sanitizier.rb
|
37
|
-
lib/textutils/title.rb
|
38
|
-
lib/textutils/title_mapper.rb
|
39
|
-
lib/textutils/
|
40
|
-
lib/textutils/
|
41
|
-
|
42
|
-
test/data/
|
43
|
-
test/data/
|
44
|
-
test/data/de-deutschland/3--by-bayern/4--oberfranken/
|
45
|
-
test/data/de-deutschland/
|
46
|
-
test/data/
|
47
|
-
test/
|
48
|
-
test/
|
49
|
-
test/
|
50
|
-
test/
|
51
|
-
test/
|
52
|
-
test/
|
53
|
-
test/
|
54
|
-
test/
|
55
|
-
test/
|
56
|
-
test/
|
57
|
-
test/
|
58
|
-
test/
|
59
|
-
test/
|
60
|
-
test/
|
61
|
-
test/
|
1
|
+
HISTORY.md
|
2
|
+
Manifest.txt
|
3
|
+
README.md
|
4
|
+
Rakefile
|
5
|
+
lib/textutils.rb
|
6
|
+
lib/textutils/classifier.rb
|
7
|
+
lib/textutils/core_ext/array.rb
|
8
|
+
lib/textutils/core_ext/file.rb
|
9
|
+
lib/textutils/core_ext/time.rb
|
10
|
+
lib/textutils/filter/code_filter.rb
|
11
|
+
lib/textutils/filter/comment_filter.rb
|
12
|
+
lib/textutils/filter/erb_django_filter.rb
|
13
|
+
lib/textutils/filter/erb_filter.rb
|
14
|
+
lib/textutils/filter/string_filter.rb
|
15
|
+
lib/textutils/helper/address_helper.rb
|
16
|
+
lib/textutils/helper/date_helper.rb
|
17
|
+
lib/textutils/helper/hypertext_helper.rb
|
18
|
+
lib/textutils/helper/tag_helper.rb
|
19
|
+
lib/textutils/helper/title_helper.rb
|
20
|
+
lib/textutils/helper/unicode_helper.rb
|
21
|
+
lib/textutils/helper/value_helper_i.rb
|
22
|
+
lib/textutils/helper/value_helper_ii.rb
|
23
|
+
lib/textutils/helper/value_helper_iii_numbers.rb
|
24
|
+
lib/textutils/helper/xml_helper.rb
|
25
|
+
lib/textutils/page.rb
|
26
|
+
lib/textutils/parser/name_parser.rb
|
27
|
+
lib/textutils/parser/name_tokenizer.rb
|
28
|
+
lib/textutils/patterns.rb
|
29
|
+
lib/textutils/reader/block_reader.rb
|
30
|
+
lib/textutils/reader/code_reader.rb
|
31
|
+
lib/textutils/reader/fixture_reader.rb
|
32
|
+
lib/textutils/reader/hash_reader.rb
|
33
|
+
lib/textutils/reader/line_reader.rb
|
34
|
+
lib/textutils/reader/tree_reader.rb
|
35
|
+
lib/textutils/reader/values_reader.rb
|
36
|
+
lib/textutils/sanitizier.rb
|
37
|
+
lib/textutils/title.rb
|
38
|
+
lib/textutils/title_mapper.rb
|
39
|
+
lib/textutils/title_mapper2.rb
|
40
|
+
lib/textutils/utils.rb
|
41
|
+
lib/textutils/version.rb
|
42
|
+
test/data/at-austria/1--n-niederoesterreich/orte.txt
|
43
|
+
test/data/cl_all.txt
|
44
|
+
test/data/de-deutschland/3--by-bayern/4--oberfranken/orte.txt
|
45
|
+
test/data/de-deutschland/3--by-bayern/4--oberfranken/orte_ii.txt
|
46
|
+
test/data/de-deutschland/orte.txt
|
47
|
+
test/data/feedburner.txt
|
48
|
+
test/helper.rb
|
49
|
+
test/test_address_helper.rb
|
50
|
+
test/test_asciify.rb
|
51
|
+
test/test_block_reader.rb
|
52
|
+
test/test_fixture_reader.rb
|
53
|
+
test/test_hypertext_helper.rb
|
54
|
+
test/test_slugify.rb
|
55
|
+
test/test_taglist.rb
|
56
|
+
test/test_title_finder.rb
|
57
|
+
test/test_title_helper.rb
|
58
|
+
test/test_title_mapper.rb
|
59
|
+
test/test_title_mapper2.rb
|
60
|
+
test/test_tree_reader.rb
|
61
|
+
test/test_tree_reader_ii.rb
|
62
|
+
test/test_unicode_helper.rb
|
63
|
+
test/test_values_reader.rb
|
data/Rakefile
CHANGED
@@ -21,7 +21,7 @@ Hoe.spec 'textutils' do
|
|
21
21
|
['props', '>=1.1.2'],
|
22
22
|
['logutils', '>=0.6.1'],
|
23
23
|
### 3rd party gems
|
24
|
-
['rubyzip'], ## todo/check: make optional -why? why not??
|
24
|
+
['rubyzip', '>=1.0.0'], ## note: 1.0 changed to require zip (pre 1.0 was zip/zip); todo/check: make optional -why? why not??
|
25
25
|
['activesupport'] ## todo/check: really needed? document what methods get used
|
26
26
|
]
|
27
27
|
|
data/lib/textutils.rb
CHANGED
@@ -66,6 +66,7 @@ require 'textutils/reader/tree_reader'
|
|
66
66
|
require 'textutils/classifier'
|
67
67
|
require 'textutils/title' # title table/mapper/finder utils
|
68
68
|
require 'textutils/title_mapper'
|
69
|
+
require 'textutils/title_mapper2'
|
69
70
|
|
70
71
|
require 'textutils/page' # for book pages and page templates
|
71
72
|
|
data/lib/textutils/title_mapper2.rb
ADDED
@@ -0,0 +1,168 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
## see textutils/title.rb
|
5
|
+
## for existing code
|
6
|
+
## move over here
|
7
|
+
|
8
|
+
|
9
|
+
module TextUtils
|
10
|
+
|
11
|
+
class TitleMapper2 ## todo/check: rename to NameMapper ? why? why not??
|
12
|
+
|
13
|
+
include LogUtils::Logging
|
14
|
+
|
15
|
+
attr_reader :known_titles ## rename to mapping or mappings or just titles - why? why not?
|
16
|
+
|
17
|
+
##
|
18
|
+
## key: e.g. augsburg
|
19
|
+
## title: e.g. FC Augsburg
|
20
|
+
## length (of title - not pattern): e.g. 11 -- do not count dots (e.g. U.S.A. => 3 or 6) why? why not?
|
21
|
+
MappingStruct = Struct.new( :key, :title, :length, :pattern) ## todo/check: use (rename to) TitleStruct - why? why not??
|
22
|
+
|
23
|
+
|
24
|
+
# Sets up a mapper for one kind of record.
#
# records - collection handed to build_title_table_for; the resulting
#           lookup table is exposed through known_titles
# tag     - name used (upcased) for the [TAG] placeholder written by
#           find_key!/find_keys! and (downcased) in debug output
#
# todo: rename tag to attrib or attrib_name - why? why not?
def initialize( records, tag )
  @tag          = tag
  @known_titles = build_title_table_for( records )   ## mapping lookup table
end
|
30
|
+
|
31
|
+
|
32
|
+
# Replaces every known title found in line with its @@oo{key}oo@@ marker.
#
# line - string to scan; modified in place
#
# map_title_for! rewrites at most one title per call, so it is called
# repeatedly until it reports that nothing matched anymore.
#
# Returns nil.
def map_titles!( line )   ## rename to just map! - why?? why not???
  loop do
    break unless map_title_for!( @tag, line, @known_titles )
  end
  nil
end
|
37
|
+
|
38
|
+
# Looks up the next @@oo{key}oo@@ marker in line using this mapper's tag.
#
# Delegates to find_key_for!, which modifies line in place.
# Returns whatever find_key_for! returns (the key, or nil if no marker is left).
def find_key!( line )
  tag = @tag
  find_key_for!( tag, line )
end
|
41
|
+
|
42
|
+
# Collects ALL keys (plural!) marked up in line.
#
# Calls find_key_for! with a numbered tag (tag1, tag2, ...) until it
# returns nothing; line is modified in place by find_key_for!.
#
# Returns an array with the keys in the order found (empty if none).
def find_keys!( line )
  found    = []
  position = 1

  loop do
    key = find_key_for!( "#{@tag}#{position}", line )
    break unless key.present?

    found << key
    position += 1
  end

  found
end
|
54
|
+
|
55
|
+
|
56
|
+
private
|
57
|
+
# Builds the lookup table (array of MappingStruct) used for title mapping.
#
# records - collection of objects responding to key, title and synonyms
#           (synonyms: one string, alternatives separated by '|');
#           records that respond to code and have one get an extra entry
#
# Conceptually the table holds key/title pairs incl. synonyms, e.g.
#
#    [[ 'wolfsburg', 'VfL Wolfsburg'],
#     [ 'augsburg',  'FC Augsburg'],
#     [ 'augsburg',  'Augi2'],
#     [ 'augsburg',  'Augi3' ],
#     [ 'stuttgart', 'VfB Stuttgart']]
#
# Returns the entries sorted by title length - longest first (best match);
#  entries of equal length keep the order in which they were built.
def build_title_table_for( records )
  known_titles = []

  records.each_with_index do |rec,index|
    title_candidates = [rec.title]
    title_candidates += rec.synonyms.split('|') if rec.synonyms.present?

    ## check if title includes subtitle e.g. Grand Prix Japan (Suzuka Circuit)
    #    make subtitle optional by adding title w/o subtitle e.g. Grand Prix Japan
    titles = []
    title_candidates.each do |t|
      titles << t
      next unless t =~ /\(.+\)/

      # remove the subtitle, collapse the double space an inline (...) leaves
      #  behind and strip leading and trailing whitespace
      extra_title = t.gsub( /\(.+\)/, '' ).squeeze( ' ' ).strip

      # note: skip if nothing is left (title was just a subtitle) - an empty
      #  title would yield a pattern that matches at any word boundary
      titles << extra_title unless extra_title.empty?
    end

    titles.each do |t|
      ## note: escape for regex plus allow subs for special chars/accents
      known_titles << MappingStruct.new( rec.key, t, t.length, TextUtils.title_esc_regex( t ) )
    end

    logger.debug " #{rec.class.name}[#{index+1}] #{rec.key} >#{titles.join('|')}<"

    ## NB: only include code field - if defined
    if rec.respond_to?(:code) && rec.code.present?
      ## note: code gets matched literally (no variants allowed for now);
      ##   the pattern ends up inside a regex - thus, escape it
      known_titles << MappingStruct.new( rec.key, rec.code, rec.code.length, Regexp.escape( rec.code ) )
    end
  end

  ## note: sort here by length (largest goes first - best match)
  ##  todo/check: original note says "exclude code and key (key should always go last)"
  ##    - not implemented; codes are sorted by length like any other title
  ##  the index is used as a tie-breaker to keep the result deterministic
  known_titles.each_with_index.sort_by { |mapping,i| [-mapping.length, i] }.map( &:first )
end
|
122
|
+
|
123
|
+
|
124
|
+
# Replaces the first matching title in line with its @@oo{key}oo@@ marker.
#
# tag      - only used (downcased) in the debug output
# line     - string to scan; modified in place on a match
# mappings - entries responding to key and pattern (regex source string);
#            tried in the given order - first match wins
#
# Returns true if a title got replaced, false otherwise.
def map_title_for!( tag, line, mappings )
  downcase_tag = tag.downcase

  mappings.each do |mapping|
    key   = mapping.key
    value = mapping.pattern

    ## wrap with word boundary (e.g. match only whole words e.g. not wac in wacker)
    ##  nb: a title ending in a non-word char (e.g. the . in Benfica Lis.) has no \b
    ##   after it - thus, also accept a space, tab or end of line
    ##  todo/check: does $ work as expected? - check w/ Benfica Lis.$
    regex = /\b#{value}(\b| |\t|$)/
    next unless line =~ regex

    logger.debug " match for #{downcase_tag} >#{key}< >#{value}<"

    # the oo padding makes sure the key inside the marker does NOT sit at a word
    #  boundary, that is, it cannot get matched again by another title (e.g. wacker, wac, etc.)
    # note: use the block form - the marker gets inserted as is
    #  (no backreference expansion e.g. \0 or \1 in key)
    line.sub!( regex ) { "@@oo#{key}oo@@ " }    # NB: add one space char at end
    return true    # break out after first match (do NOT continue)
  end

  false
end
|
146
|
+
|
147
|
+
|
148
|
+
# Finds the first @@oo{key}oo@@ marker in line and swaps it for [TAG].
#
# tag  - placeholder name; upcased for the replacement, downcased for debug output
# line - string to scan; modified in place if a marker is found
#
# Returns the key found inside the marker or nil if line has no marker.
def find_key_for!( tag, line )
  # e.g. everything in @@oo .... oo@@ (non-greedy +? and all chars but @, that is, [^@])
  marker = /@@oo([^@]+?)oo@@/

  upcase_tag   = tag.upcase
  downcase_tag = tag.downcase

  found = marker.match( line )
  return nil unless found

  key = found[1]
  logger.debug " #{downcase_tag}: >#{key}<"

  line.sub!( marker, "[#{upcase_tag}]" )
  key
end # method find_key_for!
|
165
|
+
|
166
|
+
|
167
|
+
end # class TitleMapper2
|
168
|
+
end # module TextUtils
|
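data/lib/textutils/version.rb
CHANGED
(hunk missing from this capture; the summary above lists +2 -2 - presumably the 1.3.1 → 1.4.0 version bump)
data/test/test_title_mapper2.rb
ADDED
@@ -0,0 +1,45 @@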
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
###
|
4
|
+
# to run use
|
5
|
+
# ruby -I ./lib -I ./test test/test_title_mapper2.rb
|
6
|
+
|
7
|
+
|
8
|
+
require 'helper'
|
9
|
+
|
10
|
+
|
11
|
+
# Checks that TextUtils::TitleMapper2 maps titles and synonyms in a line
# to keys and that the keys can be read back with find_key!.
class TestTitleMapper2 < Minitest::Test

  # minimal stand-in for a record (no code field)
  ClubStruct = Struct.new(:key, :title, :synonyms)

  def test_title_table
    titles_in = [
      ClubStruct.new( 'barcelona', 'Barcelona', 'FC Barcelona' ),
      ClubStruct.new( 'espanyol',  'Espanyol',  'RCD Espanyol|Espanyol Barcelona' ),
      ClubStruct.new( 'sevilla',   'Sevilla',   'Sevilla FC' )
    ]

    mapper = TextUtils::TitleMapper2.new( titles_in, 'club' )
    titles_out = mapper.known_titles

    puts 'titles_out:'
    pp titles_out

    ## note: the longest title (Espanyol Barcelona) must win over Espanyol and Barcelona
    line = "Espanyol Barcelona 1-0 FC Barcelona"
    mapper.map_titles!( line )
    puts "=> #{line}"

    club1 = mapper.find_key!( line )
    club2 = mapper.find_key!( line )
    puts "=> #{line}"

    assert_equal 'espanyol',  club1
    assert_equal 'barcelona', club2

    ## every key found gets replaced in place by the upcased tag
    assert_equal 2, line.scan( '[CLUB]' ).size
  end # method test_title_table

end # class TestTitleMapper2
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textutils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-11-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: props
|
@@ -44,14 +44,14 @@ dependencies:
|
|
44
44
|
requirements:
|
45
45
|
- - ">="
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
47
|
+
version: 1.0.0
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: 1.0.0
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: activesupport
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -86,14 +86,14 @@ dependencies:
|
|
86
86
|
requirements:
|
87
87
|
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: '3.
|
89
|
+
version: '3.14'
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
94
|
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: '3.
|
96
|
+
version: '3.14'
|
97
97
|
description: textutils - Text Filters, Helpers, Readers and More
|
98
98
|
email: ruby-talk@ruby-lang.org
|
99
99
|
executables: []
|
@@ -103,7 +103,6 @@ extra_rdoc_files:
|
|
103
103
|
- Manifest.txt
|
104
104
|
- README.md
|
105
105
|
files:
|
106
|
-
- ".gemtest"
|
107
106
|
- HISTORY.md
|
108
107
|
- Manifest.txt
|
109
108
|
- README.md
|
@@ -142,6 +141,7 @@ files:
|
|
142
141
|
- lib/textutils/sanitizier.rb
|
143
142
|
- lib/textutils/title.rb
|
144
143
|
- lib/textutils/title_mapper.rb
|
144
|
+
- lib/textutils/title_mapper2.rb
|
145
145
|
- lib/textutils/utils.rb
|
146
146
|
- lib/textutils/version.rb
|
147
147
|
- test/data/at-austria/1--n-niederoesterreich/orte.txt
|
@@ -161,6 +161,7 @@ files:
|
|
161
161
|
- test/test_title_finder.rb
|
162
162
|
- test/test_title_helper.rb
|
163
163
|
- test/test_title_mapper.rb
|
164
|
+
- test/test_title_mapper2.rb
|
164
165
|
- test/test_tree_reader.rb
|
165
166
|
- test/test_tree_reader_ii.rb
|
166
167
|
- test/test_unicode_helper.rb
|
@@ -187,22 +188,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
187
188
|
version: '0'
|
188
189
|
requirements: []
|
189
190
|
rubyforge_project:
|
190
|
-
rubygems_version: 2.
|
191
|
+
rubygems_version: 2.2.3
|
191
192
|
signing_key:
|
192
193
|
specification_version: 4
|
193
194
|
summary: textutils - Text Filters, Helpers, Readers and More
|
194
|
-
test_files:
|
195
|
-
- test/test_title_finder.rb
|
196
|
-
- test/test_fixture_reader.rb
|
197
|
-
- test/test_unicode_helper.rb
|
198
|
-
- test/test_asciify.rb
|
199
|
-
- test/test_tree_reader.rb
|
200
|
-
- test/test_title_mapper.rb
|
201
|
-
- test/test_values_reader.rb
|
202
|
-
- test/test_taglist.rb
|
203
|
-
- test/test_hypertext_helper.rb
|
204
|
-
- test/test_title_helper.rb
|
205
|
-
- test/test_slugify.rb
|
206
|
-
- test/test_address_helper.rb
|
207
|
-
- test/test_block_reader.rb
|
208
|
-
- test/test_tree_reader_ii.rb
|
195
|
+
test_files: []
|
data/.gemtest
DELETED
File without changes
|