textutils 0.8.5 → 0.8.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Manifest.txt +1 -0
- data/lib/textutils/title.rb +152 -0
- data/lib/textutils/version.rb +1 -3
- data/lib/textutils.rb +1 -0
- metadata +9 -8
data/Manifest.txt
CHANGED
data/lib/textutils/title.rb
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
# encoding: utf-8
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
## todo: rename to TitleFinder or TitleMapper ??
|
|
5
|
+
# other options TitleMatcher?
|
|
6
|
+
# TitleMapping? TitleMappings?
|
|
7
|
+
# or rename to KeyMapping?, KeyMapper?, KeyTable? etc.
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
module TextUtils
|
|
11
|
+
module TitleTable
|
|
12
|
+
|
|
13
|
+
####
|
|
14
|
+
## fix: turn it into a class w/ methods
|
|
15
|
+
#
|
|
16
|
+
# e.g. t = TitleTable.new( records, name ) # e.g. name='team'
|
|
17
|
+
# t.map!( line )
|
|
18
|
+
# t.find_key!( line )
|
|
19
|
+
# etc.
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def build_title_table_for( records )
|
|
23
|
+
## build known titles table w/ synonyms e.g.
|
|
24
|
+
#
|
|
25
|
+
# [[ 'wolfsburg', [ 'VfL Wolfsburg' ]],
|
|
26
|
+
# [ 'augsburg', [ 'FC Augsburg', 'Augi2', 'Augi3' ]],
|
|
27
|
+
# [ 'stuttgart', [ 'VfB Stuttgart' ]] ]
|
|
28
|
+
|
|
29
|
+
known_titles = []
|
|
30
|
+
|
|
31
|
+
records.each_with_index do |rec,index|
|
|
32
|
+
|
|
33
|
+
title_candidates = []
|
|
34
|
+
title_candidates << rec.title
|
|
35
|
+
|
|
36
|
+
title_candidates += rec.synonyms.split('|') if rec.synonyms.present?
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
## check if title includes subtitle e.g. Grand Prix Japan (Suzuka Circuit)
|
|
40
|
+
# make subtitle optional by adding title w/o subtitle e.g. Grand Prix Japan
|
|
41
|
+
|
|
42
|
+
titles = []
|
|
43
|
+
title_candidates.each do |t|
|
|
44
|
+
titles << t
|
|
45
|
+
if t =~ /\(.+\)/
|
|
46
|
+
extra_title = t.gsub( /\(.+\)/, '' ) # remove/delete subtitles
|
|
47
|
+
extra_title.strip! # strip leading and trailing whitespace too!
|
|
48
|
+
titles << extra_title
|
|
49
|
+
end
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
## NB: sort here by length (largest goes first - best match)
|
|
54
|
+
# exclude code and key (key should always go last)
|
|
55
|
+
titles = titles.sort { |left,right| right.length <=> left.length }
|
|
56
|
+
|
|
57
|
+
## escape for regex plus allow subs for special chars/accents
|
|
58
|
+
titles = titles.map { |title| TextUtils.title_esc_regex( title ) }
|
|
59
|
+
|
|
60
|
+
## NB: only include code field - if defined
|
|
61
|
+
titles << rec.code if rec.respond_to?(:code) && rec.code.present?
|
|
62
|
+
|
|
63
|
+
known_titles << [ rec.key, titles ]
|
|
64
|
+
|
|
65
|
+
### fix: use plain logger
|
|
66
|
+
LogUtils::Logger.root.debug " #{rec.class.name}[#{index+1}] #{rec.key} >#{titles.join('|')}<"
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
known_titles
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def find_key_for!( name, line )
|
|
75
|
+
regex = /@@oo([^@]+?)oo@@/ # e.g. everything in @@ .... @@ (use non-greedy +? plus all chars but not @, that is [^@])
|
|
76
|
+
|
|
77
|
+
upcase_name = name.upcase
|
|
78
|
+
downcase_name = name.downcase
|
|
79
|
+
|
|
80
|
+
if line =~ regex
|
|
81
|
+
value = "#{$1}"
|
|
82
|
+
### fix: use plain logger
|
|
83
|
+
LogUtils::Logger.root.debug " #{downcase_name}: >#{value}<"
|
|
84
|
+
|
|
85
|
+
line.sub!( regex, "[#{upcase_name}]" )
|
|
86
|
+
|
|
87
|
+
return $1
|
|
88
|
+
else
|
|
89
|
+
return nil
|
|
90
|
+
end
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def find_keys_for!( name, line ) # NB: keys (plural!) - will return array
|
|
95
|
+
counter = 1
|
|
96
|
+
keys = []
|
|
97
|
+
|
|
98
|
+
downcase_name = name.downcase
|
|
99
|
+
|
|
100
|
+
key = find_key_for!( "#{downcase_name}#{counter}", line )
|
|
101
|
+
while key.present?
|
|
102
|
+
keys << key
|
|
103
|
+
counter += 1
|
|
104
|
+
key = find_key_for!( "#{downcase_name}#{counter}", line )
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
keys
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def map_titles_for!( name, line, title_table )
|
|
112
|
+
title_table.each do |rec|
|
|
113
|
+
key = rec[0]
|
|
114
|
+
values = rec[1]
|
|
115
|
+
map_title_worker_for!( name, line, key, values )
|
|
116
|
+
end
|
|
117
|
+
end
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def map_title_worker_for!( name, line, key, values )
|
|
121
|
+
|
|
122
|
+
downcase_name = name.downcase
|
|
123
|
+
|
|
124
|
+
values.each do |value|
|
|
125
|
+
## nb: \b does NOT include space or newline for word boundary (only alphanums e.g. a-z0-9)
|
|
126
|
+
## (thus add it, allows match for Benfica Lis. for example - note . at the end)
|
|
127
|
+
|
|
128
|
+
## check add $ e.g. (\b| |\t|$) does this work? - check w/ Benfica Lis.$
|
|
129
|
+
regex = /\b#{value}(\b| |\t|$)/ # wrap with word boundary (e.g. match only whole words e.g. not wac in wacker)
|
|
130
|
+
if line =~ regex
|
|
131
|
+
### fix: use plain logger
|
|
132
|
+
LogUtils::Logger.root.debug " match for #{downcase_name} >#{key}< >#{value}<"
|
|
133
|
+
# make sure @@oo{key}oo@@ doesn't match itself with other key e.g. wacker, wac, etc.
|
|
134
|
+
line.sub!( regex, "@@oo#{key}oo@@ " ) # NB: add one space char at end
|
|
135
|
+
return true # break out after first match (do NOT continue)
|
|
136
|
+
end
|
|
137
|
+
end
|
|
138
|
+
return false
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
end # module TitleTable
|
|
142
|
+
end # module TextUtils
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
## auto-include methods
|
|
146
|
+
|
|
147
|
+
module TextUtils
|
|
148
|
+
# make helpers available as class methods e.g. TextUtils.convert_unicode_dashes_to_plain_ascii
|
|
149
|
+
extend TitleTable # lets us use TextUtils.build_title_table_for etc.
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
|
data/lib/textutils/version.rb
CHANGED
data/lib/textutils.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: textutils
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.8.5
|
|
4
|
+
version: 0.8.6
|
|
5
5
|
prerelease:
|
|
6
6
|
platform: ruby
|
|
7
7
|
authors:
|
|
@@ -9,11 +9,11 @@ authors:
|
|
|
9
9
|
autorequire:
|
|
10
10
|
bindir: bin
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date: 2014-02-
|
|
12
|
+
date: 2014-02-16 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
|
14
14
|
- !ruby/object:Gem::Dependency
|
|
15
15
|
name: logutils
|
|
16
|
-
requirement: &
|
|
16
|
+
requirement: &20840748 !ruby/object:Gem::Requirement
|
|
17
17
|
none: false
|
|
18
18
|
requirements:
|
|
19
19
|
- - ~>
|
|
@@ -21,10 +21,10 @@ dependencies:
|
|
|
21
21
|
version: '0.5'
|
|
22
22
|
type: :runtime
|
|
23
23
|
prerelease: false
|
|
24
|
-
version_requirements: *
|
|
24
|
+
version_requirements: *20840748
|
|
25
25
|
- !ruby/object:Gem::Dependency
|
|
26
26
|
name: rdoc
|
|
27
|
-
requirement: &
|
|
27
|
+
requirement: &20840412 !ruby/object:Gem::Requirement
|
|
28
28
|
none: false
|
|
29
29
|
requirements:
|
|
30
30
|
- - ~>
|
|
@@ -32,10 +32,10 @@ dependencies:
|
|
|
32
32
|
version: '4.0'
|
|
33
33
|
type: :development
|
|
34
34
|
prerelease: false
|
|
35
|
-
version_requirements: *
|
|
35
|
+
version_requirements: *20840412
|
|
36
36
|
- !ruby/object:Gem::Dependency
|
|
37
37
|
name: hoe
|
|
38
|
-
requirement: &
|
|
38
|
+
requirement: &20839944 !ruby/object:Gem::Requirement
|
|
39
39
|
none: false
|
|
40
40
|
requirements:
|
|
41
41
|
- - ~>
|
|
@@ -43,7 +43,7 @@ dependencies:
|
|
|
43
43
|
version: '3.7'
|
|
44
44
|
type: :development
|
|
45
45
|
prerelease: false
|
|
46
|
-
version_requirements: *
|
|
46
|
+
version_requirements: *20839944
|
|
47
47
|
description: textutils - Text Filters, Helpers, Readers and More
|
|
48
48
|
email: ruby-talk@ruby-lang.org
|
|
49
49
|
executables: []
|
|
@@ -79,6 +79,7 @@ files:
|
|
|
79
79
|
- lib/textutils/reader/line_reader.rb
|
|
80
80
|
- lib/textutils/reader/values_reader.rb
|
|
81
81
|
- lib/textutils/sanitizier.rb
|
|
82
|
+
- lib/textutils/title.rb
|
|
82
83
|
- lib/textutils/utils.rb
|
|
83
84
|
- lib/textutils/version.rb
|
|
84
85
|
- test/helper.rb
|