textutils 0.8.5 → 0.8.6
Sign up to get free protection for your applications and to get access to all the features.
- data/Manifest.txt +1 -0
- data/lib/textutils/title.rb +152 -0
- data/lib/textutils/version.rb +1 -3
- data/lib/textutils.rb +1 -0
- metadata +9 -8
data/Manifest.txt
CHANGED
data/lib/textutils/title.rb
ADDED
@@ -0,0 +1,152 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
## todo: rename to TitleFinder or TitleMapper ??
|
5
|
+
# other options TitleMatcher?
|
6
|
+
# TitleMapping? TitleMappings?
|
7
|
+
# or rename to KeyMapping?, KeyMapper?, KeyTable? etc.
|
8
|
+
|
9
|
+
|
10
|
+
module TextUtils
  ##
  # Helpers that build a "title table" from records and use it to replace
  # known titles in a line of text with @@oo{key}oo@@ markers, which can then
  # be collected (and replaced by [NAME] tags) via the find_key(s)_for! helpers.
  #
  # todo/fix: turn it into a class w/ methods
  #   e.g. t = TitleTable.new( records, name )   # e.g. name='team'
  #        t.map!( line )
  #        t.find_key!( line )
  #        etc.
  module TitleTable

    ##
    # Builds the table of known titles (w/ synonyms) for the given records e.g.
    #
    #   [[ 'wolfsburg', [ 'VfL Wolfsburg' ]],
    #    [ 'augsburg',  [ 'FC Augsburg', 'Augi2', 'Augi3' ]],
    #    [ 'stuttgart', [ 'VfB Stuttgart' ]] ]
    #
    # Each record must respond to +key+, +title+ and +synonyms+ (a string with
    # alternatives separated by '|'); +code+ is optional.
    #
    # Returns an array of [key, patterns] pairs; the patterns are strings that
    # get interpolated into a regex by map_title_worker_for!.
    def build_title_table_for( records )
      known_titles = []

      records.each_with_index do |rec, index|
        title_candidates = [rec.title]
        title_candidates += rec.synonyms.split('|') if rec.synonyms.present?

        ## check if title includes subtitle e.g. Grand Prix Japan (Suzuka Circuit)
        #    make subtitle optional by adding title w/o subtitle e.g. Grand Prix Japan
        titles = []
        title_candidates.each do |candidate|
          titles << candidate
          next unless candidate =~ /\(.+\)/

          # remove/delete subtitle and strip leading and trailing whitespace too
          titles << candidate.gsub( /\(.+\)/, '' ).strip
        end

        ## drop blank entries (e.g. from 'a||b' synonyms or a title that is
        #    only a subtitle); a blank pattern would turn into a regex that
        #    matches at the first word boundary of ANY line
        titles = titles.reject { |title| title.to_s.strip.empty? }

        ## NB: sort here by length (largest goes first - best match)
        #    exclude code and key (key should always go last)
        titles = titles.sort { |left, right| right.length <=> left.length }

        ## escape for regex plus allow subs for special chars/accents
        titles = titles.map { |title| TextUtils.title_esc_regex( title ) }

        ## NB: only include code field - if defined;
        #    the code skips the step above, thus escape it here so that
        #    chars such as . or + are matched literally
        titles << Regexp.escape( rec.code ) if rec.respond_to?(:code) && rec.code.present?

        known_titles << [ rec.key, titles ]

        ### fix: use plain logger
        LogUtils::Logger.root.debug " #{rec.class.name}[#{index+1}] #{rec.key} >#{titles.join('|')}<"
      end

      known_titles
    end

    ##
    # Looks for the first @@oo{key}oo@@ marker in +line+.
    #
    # On a match the marker gets replaced in place by [NAME] (+name+ in
    # upper case) and the key is returned; otherwise +line+ is left untouched
    # and nil is returned.
    def find_key_for!( name, line )
      # everything in @@oo...oo@@ (non-greedy +? plus all chars but not @, that is [^@])
      regex = /@@oo([^@]+?)oo@@/

      upcase_name   = name.upcase
      downcase_name = name.downcase

      match = regex.match( line )
      return nil if match.nil?

      value = match[1]
      ### fix: use plain logger
      LogUtils::Logger.root.debug " #{downcase_name}: >#{value}<"

      # block form: insert the tag literally (no back-reference processing)
      line.sub!( regex ) { "[#{upcase_name}]" }

      value
    end

    ##
    # Collects all keys marked in +line+. NB: keys (plural!) - returns an array.
    #
    # Calls find_key_for! with name1, name2, ... (+name+ in lower case plus a
    # running counter) until no (present) key is found, thus the markers get
    # replaced in place by [NAME1], [NAME2], etc.
    def find_keys_for!( name, line )
      downcase_name = name.downcase
      keys    = []
      counter = 1

      loop do
        key = find_key_for!( "#{downcase_name}#{counter}", line )
        break unless key.present?

        keys << key
        counter += 1
      end

      keys
    end

    ##
    # Runs map_title_worker_for! on +line+ for every [key, patterns] entry
    # of +title_table+ (as built by build_title_table_for).
    def map_titles_for!( name, line, title_table )
      title_table.each do |key, values|
        map_title_worker_for!( name, line, key, values )
      end
    end

    ##
    # Tries the patterns in +values+ in order; the first one matching +line+
    # gets replaced in place by the marker @@oo{key}oo@@ (plus one space).
    #
    # +name+ is only used for the debug log message.
    # Returns true on the first match (no further patterns are tried),
    # false if nothing matched.
    def map_title_worker_for!( name, line, key, values )
      downcase_name = name.downcase

      values.each do |value|
        ## nb: \b does NOT include space or newline for word boundary (only alphanums e.g. a-z0-9)
        ##  (thus add it, allows match for Benfica Lis. for example - note . at the end)
        ## check add $ e.g. (\b| |\t|$) does this work? - check w/ Benfica Lis.$
        # wrap with word boundary (e.g. match only whole words e.g. not wac in wacker)
        regex = /\b#{value}(\b| |\t|$)/
        next unless line =~ regex

        ### fix: use plain logger
        LogUtils::Logger.root.debug " match for #{downcase_name} >#{key}< >#{value}<"

        # make sure @@oo{key}oo@@ doesn't match itself with other key e.g. wacker, wac, etc.
        # NB: add one space char at end; block form inserts the key literally
        line.sub!( regex ) { "@@oo#{key}oo@@ " }
        return true    # break out after first match (do NOT continue)
      end

      false
    end

  end # module TitleTable
end # module TextUtils
|
143
|
+
|
144
|
+
|
145
|
+
## auto-include methods
|
146
|
+
|
147
|
+
module TextUtils
  # mix the TitleTable helpers into the module itself, so that they can be
  # called as module-level methods e.g. TextUtils.build_title_table_for etc.
  extend TitleTable
end
|
151
|
+
|
152
|
+
|
data/lib/textutils/version.rb
CHANGED
data/lib/textutils.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textutils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.5
|
4
|
+
version: 0.8.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-02-
|
12
|
+
date: 2014-02-16 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: logutils
|
16
|
-
requirement: &
|
16
|
+
requirement: &20840748 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0.5'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *20840748
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rdoc
|
27
|
-
requirement: &
|
27
|
+
requirement: &20840412 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '4.0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *20840412
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: hoe
|
38
|
-
requirement: &
|
38
|
+
requirement: &20839944 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: '3.7'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *20839944
|
47
47
|
description: textutils - Text Filters, Helpers, Readers and More
|
48
48
|
email: ruby-talk@ruby-lang.org
|
49
49
|
executables: []
|
@@ -79,6 +79,7 @@ files:
|
|
79
79
|
- lib/textutils/reader/line_reader.rb
|
80
80
|
- lib/textutils/reader/values_reader.rb
|
81
81
|
- lib/textutils/sanitizier.rb
|
82
|
+
- lib/textutils/title.rb
|
82
83
|
- lib/textutils/utils.rb
|
83
84
|
- lib/textutils/version.rb
|
84
85
|
- test/helper.rb
|