textutils 1.3.1 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Manifest.txt +63 -61
- data/Rakefile +1 -1
- data/lib/textutils.rb +1 -0
- data/lib/textutils/title_mapper2.rb +168 -0
- data/lib/textutils/version.rb +2 -2
- data/test/test_title_mapper2.rb +45 -0
- metadata +10 -23
- data/.gemtest +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0db02f95a1da720a3778b0e1dc3636f7318e6fa6
|
4
|
+
data.tar.gz: c3420d50bc0c25e3d7e6ced6b9959c01770d471b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 690e28c6bb15160be57234c3092ef2db54eabfa5d7b66f6ed58dbcd6d5b876451654833c9c0334d55921571fb8498942ae02e8475d437dbd3944ba78afd340ab
|
7
|
+
data.tar.gz: 7348393e33bd3813c55c5b1360ec524836aa1cffb545e53597b599d89f009505c64560df2f705fac44c687644cfcc404aa2a653de8fdc8c6b054f0ae192c144f
|
data/Manifest.txt
CHANGED
@@ -1,61 +1,63 @@
|
|
1
|
-
HISTORY.md
|
2
|
-
Manifest.txt
|
3
|
-
README.md
|
4
|
-
Rakefile
|
5
|
-
lib/textutils.rb
|
6
|
-
lib/textutils/classifier.rb
|
7
|
-
lib/textutils/core_ext/array.rb
|
8
|
-
lib/textutils/core_ext/file.rb
|
9
|
-
lib/textutils/core_ext/time.rb
|
10
|
-
lib/textutils/filter/code_filter.rb
|
11
|
-
lib/textutils/filter/comment_filter.rb
|
12
|
-
lib/textutils/filter/erb_django_filter.rb
|
13
|
-
lib/textutils/filter/erb_filter.rb
|
14
|
-
lib/textutils/filter/string_filter.rb
|
15
|
-
lib/textutils/helper/address_helper.rb
|
16
|
-
lib/textutils/helper/date_helper.rb
|
17
|
-
lib/textutils/helper/hypertext_helper.rb
|
18
|
-
lib/textutils/helper/tag_helper.rb
|
19
|
-
lib/textutils/helper/title_helper.rb
|
20
|
-
lib/textutils/helper/unicode_helper.rb
|
21
|
-
lib/textutils/helper/value_helper_i.rb
|
22
|
-
lib/textutils/helper/value_helper_ii.rb
|
23
|
-
lib/textutils/helper/value_helper_iii_numbers.rb
|
24
|
-
lib/textutils/helper/xml_helper.rb
|
25
|
-
lib/textutils/page.rb
|
26
|
-
lib/textutils/parser/name_parser.rb
|
27
|
-
lib/textutils/parser/name_tokenizer.rb
|
28
|
-
lib/textutils/patterns.rb
|
29
|
-
lib/textutils/reader/block_reader.rb
|
30
|
-
lib/textutils/reader/code_reader.rb
|
31
|
-
lib/textutils/reader/fixture_reader.rb
|
32
|
-
lib/textutils/reader/hash_reader.rb
|
33
|
-
lib/textutils/reader/line_reader.rb
|
34
|
-
lib/textutils/reader/tree_reader.rb
|
35
|
-
lib/textutils/reader/values_reader.rb
|
36
|
-
lib/textutils/sanitizier.rb
|
37
|
-
lib/textutils/title.rb
|
38
|
-
lib/textutils/title_mapper.rb
|
39
|
-
lib/textutils/
|
40
|
-
lib/textutils/
|
41
|
-
|
42
|
-
test/data/
|
43
|
-
test/data/
|
44
|
-
test/data/de-deutschland/3--by-bayern/4--oberfranken/
|
45
|
-
test/data/de-deutschland/
|
46
|
-
test/data/
|
47
|
-
test/
|
48
|
-
test/
|
49
|
-
test/
|
50
|
-
test/
|
51
|
-
test/
|
52
|
-
test/
|
53
|
-
test/
|
54
|
-
test/
|
55
|
-
test/
|
56
|
-
test/
|
57
|
-
test/
|
58
|
-
test/
|
59
|
-
test/
|
60
|
-
test/
|
61
|
-
test/
|
1
|
+
HISTORY.md
|
2
|
+
Manifest.txt
|
3
|
+
README.md
|
4
|
+
Rakefile
|
5
|
+
lib/textutils.rb
|
6
|
+
lib/textutils/classifier.rb
|
7
|
+
lib/textutils/core_ext/array.rb
|
8
|
+
lib/textutils/core_ext/file.rb
|
9
|
+
lib/textutils/core_ext/time.rb
|
10
|
+
lib/textutils/filter/code_filter.rb
|
11
|
+
lib/textutils/filter/comment_filter.rb
|
12
|
+
lib/textutils/filter/erb_django_filter.rb
|
13
|
+
lib/textutils/filter/erb_filter.rb
|
14
|
+
lib/textutils/filter/string_filter.rb
|
15
|
+
lib/textutils/helper/address_helper.rb
|
16
|
+
lib/textutils/helper/date_helper.rb
|
17
|
+
lib/textutils/helper/hypertext_helper.rb
|
18
|
+
lib/textutils/helper/tag_helper.rb
|
19
|
+
lib/textutils/helper/title_helper.rb
|
20
|
+
lib/textutils/helper/unicode_helper.rb
|
21
|
+
lib/textutils/helper/value_helper_i.rb
|
22
|
+
lib/textutils/helper/value_helper_ii.rb
|
23
|
+
lib/textutils/helper/value_helper_iii_numbers.rb
|
24
|
+
lib/textutils/helper/xml_helper.rb
|
25
|
+
lib/textutils/page.rb
|
26
|
+
lib/textutils/parser/name_parser.rb
|
27
|
+
lib/textutils/parser/name_tokenizer.rb
|
28
|
+
lib/textutils/patterns.rb
|
29
|
+
lib/textutils/reader/block_reader.rb
|
30
|
+
lib/textutils/reader/code_reader.rb
|
31
|
+
lib/textutils/reader/fixture_reader.rb
|
32
|
+
lib/textutils/reader/hash_reader.rb
|
33
|
+
lib/textutils/reader/line_reader.rb
|
34
|
+
lib/textutils/reader/tree_reader.rb
|
35
|
+
lib/textutils/reader/values_reader.rb
|
36
|
+
lib/textutils/sanitizier.rb
|
37
|
+
lib/textutils/title.rb
|
38
|
+
lib/textutils/title_mapper.rb
|
39
|
+
lib/textutils/title_mapper2.rb
|
40
|
+
lib/textutils/utils.rb
|
41
|
+
lib/textutils/version.rb
|
42
|
+
test/data/at-austria/1--n-niederoesterreich/orte.txt
|
43
|
+
test/data/cl_all.txt
|
44
|
+
test/data/de-deutschland/3--by-bayern/4--oberfranken/orte.txt
|
45
|
+
test/data/de-deutschland/3--by-bayern/4--oberfranken/orte_ii.txt
|
46
|
+
test/data/de-deutschland/orte.txt
|
47
|
+
test/data/feedburner.txt
|
48
|
+
test/helper.rb
|
49
|
+
test/test_address_helper.rb
|
50
|
+
test/test_asciify.rb
|
51
|
+
test/test_block_reader.rb
|
52
|
+
test/test_fixture_reader.rb
|
53
|
+
test/test_hypertext_helper.rb
|
54
|
+
test/test_slugify.rb
|
55
|
+
test/test_taglist.rb
|
56
|
+
test/test_title_finder.rb
|
57
|
+
test/test_title_helper.rb
|
58
|
+
test/test_title_mapper.rb
|
59
|
+
test/test_title_mapper2.rb
|
60
|
+
test/test_tree_reader.rb
|
61
|
+
test/test_tree_reader_ii.rb
|
62
|
+
test/test_unicode_helper.rb
|
63
|
+
test/test_values_reader.rb
|
data/Rakefile
CHANGED
@@ -21,7 +21,7 @@ Hoe.spec 'textutils' do
|
|
21
21
|
['props', '>=1.1.2'],
|
22
22
|
['logutils', '>=0.6.1'],
|
23
23
|
### 3rd party gems
|
24
|
-
['rubyzip'], ## todo/check: make optional -why? why not??
|
24
|
+
['rubyzip', '>=1.0.0'], ## note: 1.0 changed to require zip (pre 1.0 was zip/zip); todo/check: make optional -why? why not??
|
25
25
|
['activesupport'] ## todo/check: really needed? document what methods get used
|
26
26
|
]
|
27
27
|
|
data/lib/textutils.rb
CHANGED
@@ -66,6 +66,7 @@ require 'textutils/reader/tree_reader'
|
|
66
66
|
require 'textutils/classifier'
|
67
67
|
require 'textutils/title' # title table/mapper/finder utils
|
68
68
|
require 'textutils/title_mapper'
|
69
|
+
require 'textutils/title_mapper2'
|
69
70
|
|
70
71
|
require 'textutils/page' # for book pages and page templates
|
71
72
|
|
@@ -0,0 +1,168 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
## see textutils/title.rb
|
5
|
+
## for existing code
|
6
|
+
## move over here
|
7
|
+
|
8
|
+
|
9
|
+
module TextUtils
|
10
|
+
|
11
|
+
class TitleMapper2 ## todo/check: rename to NameMapper ? why? why not??
|
12
|
+
|
13
|
+
include LogUtils::Logging
|
14
|
+
|
15
|
+
attr_reader :known_titles ## rename to mapping or mappings or just titles - why? why not?
|
16
|
+
|
17
|
+
##
|
18
|
+
## key: e.g. augsburg
|
19
|
+
## title: e.g. FC Augsburg
|
20
|
+
## length (of title - not pattern): e.g. 11 -- do not count dots (e.g. U.S.A. => 3 or 6) why? why not?
|
21
|
+
MappingStruct = Struct.new( :key, :title, :length, :pattern) ## todo/check: use (rename to) TitleStruct - why? why not??
|
22
|
+
|
23
|
+
|
24
|
+
def initialize( records, tag )
|
25
|
+
@known_titles = build_title_table_for( records ) ## build mapping lookup table
|
26
|
+
|
27
|
+
## todo: rename tag to attrib or attrib_name - why ?? why not ???
|
28
|
+
@tag = tag # e.g. tag name use for @@brewery@@ @@team@@ etc.
|
29
|
+
end
|
30
|
+
|
31
|
+
|
32
|
+
def map_titles!( line ) ## rename to just map! - why?? why not???
|
33
|
+
begin
|
34
|
+
found = map_title_for!( @tag, line, @known_titles )
|
35
|
+
end while found
|
36
|
+
end
|
37
|
+
|
38
|
+
def find_key!( line )
|
39
|
+
find_key_for!( @tag, line )
|
40
|
+
end
|
41
|
+
|
42
|
+
def find_keys!( line ) # NB: keys (plural!) - will return array
|
43
|
+
counter = 1
|
44
|
+
keys = []
|
45
|
+
|
46
|
+
key = find_key_for!( "#{@tag}#{counter}", line )
|
47
|
+
while key.present?
|
48
|
+
keys << key
|
49
|
+
counter += 1
|
50
|
+
key = find_key_for!( "#{@tag}#{counter}", line )
|
51
|
+
end
|
52
|
+
keys
|
53
|
+
end
|
54
|
+
|
55
|
+
|
56
|
+
private
|
57
|
+
def build_title_table_for( records )
|
58
|
+
|
59
|
+
## build known tracks table w/ synonyms e.g.
|
60
|
+
#
|
61
|
+
# [[ 'wolfsbrug', 'VfL Wolfsburg'],
|
62
|
+
# [ 'augsburg', 'FC Augsburg'],
|
63
|
+
# [ 'augsburg', 'Augi2'],
|
64
|
+
# [ 'augsburg', 'Augi3' ],
|
65
|
+
# [ 'stuttgart', 'VfB Stuttgart']]
|
66
|
+
|
67
|
+
known_titles = []
|
68
|
+
|
69
|
+
records.each_with_index do |rec,index|
|
70
|
+
|
71
|
+
title_candidates = []
|
72
|
+
title_candidates << rec.title
|
73
|
+
|
74
|
+
title_candidates += rec.synonyms.split('|') if rec.synonyms.present?
|
75
|
+
|
76
|
+
|
77
|
+
## check if title includes subtitle e.g. Grand Prix Japan (Suzuka Circuit)
|
78
|
+
# make subtitle optional by adding title w/o subtitle e.g. Grand Prix Japan
|
79
|
+
|
80
|
+
titles = []
|
81
|
+
title_candidates.each do |t|
|
82
|
+
titles << t
|
83
|
+
if t =~ /\(.+\)/
|
84
|
+
extra_title = t.gsub( /\(.+\)/, '' ) # remove/delete subtitles
|
85
|
+
# note: strip leading n trailing withspaces too!
|
86
|
+
# -- todo: add squish or something if () is inline e.g. leaves two spaces?
|
87
|
+
extra_title.strip!
|
88
|
+
titles << extra_title
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
titles.each do |t|
|
93
|
+
m = MappingStruct.new
|
94
|
+
m.key = rec.key
|
95
|
+
m.title = t
|
96
|
+
m.length = t.length
|
97
|
+
## note: escape for regex plus allow subs for special chars/accents
|
98
|
+
m.pattern = TextUtils.title_esc_regex( t )
|
99
|
+
|
100
|
+
known_titles << m
|
101
|
+
end
|
102
|
+
|
103
|
+
logger.debug " #{rec.class.name}[#{index+1}] #{rec.key} >#{titles.join('|')}<"
|
104
|
+
|
105
|
+
## NB: only include code field - if defined
|
106
|
+
if rec.respond_to?(:code) && rec.code.present?
|
107
|
+
m = MappingStruct.new
|
108
|
+
m.key = rec.key
|
109
|
+
m.title = rec.code
|
110
|
+
m.length = rec.code.length
|
111
|
+
m.pattern = rec.code ## note: use code for now as is (no variants allowed fow now)
|
112
|
+
|
113
|
+
known_titles << m
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
## note: sort here by length (largest goes first - best match)
|
118
|
+
# exclude code and key (key should always go last)
|
119
|
+
known_titles = known_titles.sort { |left,right| right.length <=> left.length }
|
120
|
+
known_titles
|
121
|
+
end
|
122
|
+
|
123
|
+
|
124
|
+
def map_title_for!( tag, line, mappings )
|
125
|
+
|
126
|
+
downcase_tag = tag.downcase
|
127
|
+
|
128
|
+
mappings.each do |mapping|
|
129
|
+
|
130
|
+
key = mapping.key
|
131
|
+
value = mapping.pattern
|
132
|
+
## nb: \b does NOT include space or newline for word boundry (only alphanums e.g. a-z0-9)
|
133
|
+
## (thus add it, allows match for Benfica Lis. for example - note . at the end)
|
134
|
+
|
135
|
+
## check add $ e.g. (\b| |\t|$) does this work? - check w/ Benfica Lis.$
|
136
|
+
regex = /\b#{value}(\b| |\t|$)/ # wrap with world boundry (e.g. match only whole words e.g. not wac in wacker)
|
137
|
+
if line =~ regex
|
138
|
+
logger.debug " match for #{downcase_tag} >#{key}< >#{value}<"
|
139
|
+
# make sure @@oo{key}oo@@ doesn't match itself with other key e.g. wacker, wac, etc.
|
140
|
+
line.sub!( regex, "@@oo#{key}oo@@ " ) # NB: add one space char at end
|
141
|
+
return true # break out after first match (do NOT continue)
|
142
|
+
end
|
143
|
+
end
|
144
|
+
return false
|
145
|
+
end
|
146
|
+
|
147
|
+
|
148
|
+
def find_key_for!( tag, line )
|
149
|
+
regex = /@@oo([^@]+?)oo@@/ # e.g. everything in @@ .... @@ (use non-greedy +? plus all chars but not @, that is [^@])
|
150
|
+
|
151
|
+
upcase_tag = tag.upcase
|
152
|
+
downcase_tag = tag.downcase
|
153
|
+
|
154
|
+
if line =~ regex
|
155
|
+
value = "#{$1}"
|
156
|
+
logger.debug " #{downcase_tag}: >#{value}<"
|
157
|
+
|
158
|
+
line.sub!( regex, "[#{upcase_tag}]" )
|
159
|
+
|
160
|
+
return $1
|
161
|
+
else
|
162
|
+
return nil
|
163
|
+
end
|
164
|
+
end # method find_key_for!
|
165
|
+
|
166
|
+
|
167
|
+
end # class TitleMapper2
|
168
|
+
end # module TextUtils
|
data/lib/textutils/version.rb
CHANGED
@@ -0,0 +1,45 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
###
|
4
|
+
# to run use
|
5
|
+
# ruby -I ./lib -I ./test test/test_title_mapper2.rb
|
6
|
+
|
7
|
+
|
8
|
+
require 'helper'
|
9
|
+
|
10
|
+
|
11
|
+
class TestTitleMapper2 < Minitest::Test
|
12
|
+
|
13
|
+
ClubStruct = Struct.new(:key, :title, :synonyms)
|
14
|
+
|
15
|
+
def test_title_table
|
16
|
+
|
17
|
+
titles_in = [
|
18
|
+
ClubStruct.new( 'barcelona', 'Barcelona', 'FC Barcelona' ),
|
19
|
+
ClubStruct.new( 'espanyol', 'Espanyol', 'RCD Espanyol|Espanyol Barcelona' ),
|
20
|
+
ClubStruct.new( 'sevilla', 'Sevilla', 'Sevilla FC' )
|
21
|
+
]
|
22
|
+
|
23
|
+
mapper = TextUtils::TitleMapper2.new( titles_in, 'club' )
|
24
|
+
titles_out = mapper.known_titles
|
25
|
+
|
26
|
+
puts 'titles_out:'
|
27
|
+
pp titles_out
|
28
|
+
|
29
|
+
line = "Espanyol Barcelona 1-0 FC Barcelona"
|
30
|
+
mapper.map_titles!( line )
|
31
|
+
puts "=> #{line}"
|
32
|
+
|
33
|
+
club1 = mapper.find_key!( line )
|
34
|
+
club2 = mapper.find_key!( line )
|
35
|
+
puts "=> #{line}"
|
36
|
+
|
37
|
+
assert_equal 'espanyol', club1
|
38
|
+
assert_equal 'barcelona', club2
|
39
|
+
|
40
|
+
assert true ## assume everything ok if we get here
|
41
|
+
|
42
|
+
end # method test_title_table
|
43
|
+
|
44
|
+
|
45
|
+
end # class TestTitleMapper2
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textutils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gerald Bauer
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-11-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: props
|
@@ -44,14 +44,14 @@ dependencies:
|
|
44
44
|
requirements:
|
45
45
|
- - ">="
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
47
|
+
version: 1.0.0
|
48
48
|
type: :runtime
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: 1.0.0
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: activesupport
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -86,14 +86,14 @@ dependencies:
|
|
86
86
|
requirements:
|
87
87
|
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version: '3.
|
89
|
+
version: '3.14'
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
94
|
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version: '3.
|
96
|
+
version: '3.14'
|
97
97
|
description: textutils - Text Filters, Helpers, Readers and More
|
98
98
|
email: ruby-talk@ruby-lang.org
|
99
99
|
executables: []
|
@@ -103,7 +103,6 @@ extra_rdoc_files:
|
|
103
103
|
- Manifest.txt
|
104
104
|
- README.md
|
105
105
|
files:
|
106
|
-
- ".gemtest"
|
107
106
|
- HISTORY.md
|
108
107
|
- Manifest.txt
|
109
108
|
- README.md
|
@@ -142,6 +141,7 @@ files:
|
|
142
141
|
- lib/textutils/sanitizier.rb
|
143
142
|
- lib/textutils/title.rb
|
144
143
|
- lib/textutils/title_mapper.rb
|
144
|
+
- lib/textutils/title_mapper2.rb
|
145
145
|
- lib/textutils/utils.rb
|
146
146
|
- lib/textutils/version.rb
|
147
147
|
- test/data/at-austria/1--n-niederoesterreich/orte.txt
|
@@ -161,6 +161,7 @@ files:
|
|
161
161
|
- test/test_title_finder.rb
|
162
162
|
- test/test_title_helper.rb
|
163
163
|
- test/test_title_mapper.rb
|
164
|
+
- test/test_title_mapper2.rb
|
164
165
|
- test/test_tree_reader.rb
|
165
166
|
- test/test_tree_reader_ii.rb
|
166
167
|
- test/test_unicode_helper.rb
|
@@ -187,22 +188,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
187
188
|
version: '0'
|
188
189
|
requirements: []
|
189
190
|
rubyforge_project:
|
190
|
-
rubygems_version: 2.
|
191
|
+
rubygems_version: 2.2.3
|
191
192
|
signing_key:
|
192
193
|
specification_version: 4
|
193
194
|
summary: textutils - Text Filters, Helpers, Readers and More
|
194
|
-
test_files:
|
195
|
-
- test/test_title_finder.rb
|
196
|
-
- test/test_fixture_reader.rb
|
197
|
-
- test/test_unicode_helper.rb
|
198
|
-
- test/test_asciify.rb
|
199
|
-
- test/test_tree_reader.rb
|
200
|
-
- test/test_title_mapper.rb
|
201
|
-
- test/test_values_reader.rb
|
202
|
-
- test/test_taglist.rb
|
203
|
-
- test/test_hypertext_helper.rb
|
204
|
-
- test/test_title_helper.rb
|
205
|
-
- test/test_slugify.rb
|
206
|
-
- test/test_address_helper.rb
|
207
|
-
- test/test_block_reader.rb
|
208
|
-
- test/test_tree_reader_ii.rb
|
195
|
+
test_files: []
|
data/.gemtest
DELETED
File without changes
|