lcbo 0.9.6 → 0.9.7
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.md +4 -0
- data/Gemfile +1 -0
- data/Gemfile.lock +2 -0
- data/lcbo.gemspec +1 -0
- data/lib/lcbo/crawlkit.rb +1 -0
- data/lib/lcbo/crawlkit/response.rb +3 -3
- data/lib/lcbo/crawlkit/titlecase_helper.rb +39 -70
- data/lib/lcbo/version.rb +1 -1
- data/spec/crawlkit/titlecase_helper_spec.rb +1 -7
- metadata +16 -3
data/CHANGELOG.md
CHANGED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
data/lcbo.gemspec
CHANGED
data/lib/lcbo/crawlkit.rb
CHANGED
@@ -18,10 +18,10 @@ module LCBO
|
|
18
18
|
end
|
19
19
|
|
20
20
|
def self.normalize_encoding(html)
|
21
|
-
if html.valid_encoding?
|
22
|
-
html
|
23
|
-
else
|
21
|
+
if html.force_encoding('ISO-8859-1').valid_encoding?
|
24
22
|
html.encode('UTF-8', 'ISO-8859-1')
|
23
|
+
else
|
24
|
+
html.encode('UTF-8')
|
25
25
|
end.gsub("\r\n", "\n")
|
26
26
|
end
|
27
27
|
|
@@ -4,92 +4,61 @@ module LCBO
|
|
4
4
|
module CrawlKit
|
5
5
|
class TitleCaseHelper
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
7
|
+
SMALL_WORDS = %w[
|
8
|
+
a an and as at but by en for
|
9
|
+
if in of del de on or the to
|
10
|
+
v v. via vs vs.
|
11
|
+
]
|
12
|
+
|
13
|
+
ACRONYMS = %w[
|
14
|
+
vqa vsop xo nq5 vs xxx igt
|
15
|
+
xiii xi xoxo srl bdb cvbg
|
16
|
+
ocb lcbo i ii iii
|
17
|
+
]
|
16
18
|
|
17
19
|
attr_reader :input
|
18
20
|
|
19
21
|
def self.[](string)
|
20
|
-
|
22
|
+
titlecase(string)
|
21
23
|
end
|
22
24
|
|
23
25
|
def self.upcase(string)
|
24
|
-
|
26
|
+
UnicodeUtils.simple_upcase(string)
|
25
27
|
end
|
26
28
|
|
27
29
|
def self.downcase(string)
|
28
|
-
|
29
|
-
end
|
30
|
-
|
31
|
-
def self.preclean(string)
|
32
|
-
# Strip useless bracketed crap: Some Product Name (Some Redundant Stuff)**
|
33
|
-
string.gsub(/\(.+\Z/, '').
|
34
|
-
# Strip trailing stars.
|
35
|
-
gsub(/\*+\Z/, '')
|
30
|
+
UnicodeUtils.simple_downcase(string)
|
36
31
|
end
|
37
32
|
|
38
33
|
def self.capitalize(string)
|
39
|
-
|
40
|
-
if first_letter
|
41
|
-
uchar = upcase(first_letter)
|
42
|
-
string.sub(/#{first_letter}/u, uchar)
|
43
|
-
else
|
44
|
-
string
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
def self.titleize(string)
|
49
|
-
phrases(preclean(downcase(string))).map do |phrase|
|
50
|
-
words = phrase.split
|
51
|
-
words.map do |word|
|
52
|
-
def word.capitalize
|
53
|
-
self.sub(ALPHA_RE) { |subword| TitleCaseHelper.capitalize(subword) }
|
54
|
-
end
|
55
|
-
case word
|
56
|
-
when *(ACRONYMS + ACRONYMS.map { |ac| capitalize(ac) })
|
57
|
-
upcase(word)
|
58
|
-
when /#{ALPHA_RANGE}\&#{ALPHA_RANGE}/u # words with &, like E&J
|
59
|
-
word.split(/\&/).map { |w| capitalize(w) }.join('&')
|
60
|
-
when /#{ALPHA_RANGE}\-#{ALPHA_RANGE}/u # words with dashes, like "Smith-Weston"
|
61
|
-
word.split(/\-/).map { |w| capitalize(w) }.join('-')
|
62
|
-
when /#{ALPHA_RANGE}\/#{ALPHA_RANGE}/u # words with slashes
|
63
|
-
word.split(/\//).map { |w| capitalize(w) }.join(' / ')
|
64
|
-
when /#{ALPHA_RANGE}\.#{ALPHA_RANGE}/u # words with dots, like "example.com"
|
65
|
-
capitalized = word.split(/\./u).map { |w| capitalize(w) }.join('.')
|
66
|
-
'.' == word[-1, 1] ? capitalized + '.' : capitalized
|
67
|
-
when /^#{ALPHA_RANGE}.*#{UPPER_RANGE}/u # non-first letter capitalized already
|
68
|
-
word
|
69
|
-
when words.first, words.last
|
70
|
-
word.capitalize
|
71
|
-
when *(SMALL_WORDS + SMALL_WORDS.map { |small| capitalize(small) })
|
72
|
-
word.downcase
|
73
|
-
else
|
74
|
-
word.capitalize
|
75
|
-
end
|
76
|
-
end.join(' ')
|
77
|
-
end.join(' ').
|
78
|
-
# Special case for Word'S
|
79
|
-
gsub(/(['’])S\b/, '\1s')
|
34
|
+
UnicodeUtils.titlecase(string)
|
80
35
|
end
|
81
36
|
|
82
|
-
def self.
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
37
|
+
def self.titlecase(string)
|
38
|
+
preclean = lambda { |s|
|
39
|
+
# Strip bracketed stuff and trailing junk: Product (Junk)**
|
40
|
+
s.gsub(/\(.+\Z/, '').gsub(/\*+\Z/, '').strip
|
41
|
+
}
|
42
|
+
count = 0 # Ewwww
|
43
|
+
capitalize(preclean.(string)).split.map do |word|
|
44
|
+
count += 1
|
45
|
+
case word.downcase
|
46
|
+
when /[\w]\/[\w]/ # words with slashes
|
47
|
+
word.split('/').map { |w| capitalize(w) }.join(' / ')
|
48
|
+
when /[\w]\&[\w]/ # words with &, like E&J
|
49
|
+
word.split('&').map { |w| capitalize(w) }.join('&')
|
50
|
+
when /[\w]\-[\w]/ # words with dashes, like "Super-Cool"
|
51
|
+
word.split('-').map { |w| capitalize(w) }.join('-')
|
52
|
+
when /[\w]\.[\w]/ # words with dots, like "A.B.C."
|
53
|
+
word.split('.').map { |w| upcase(w) }.join('.') + '.'
|
54
|
+
when *SMALL_WORDS
|
55
|
+
1 == count ? word : word.downcase
|
56
|
+
when *ACRONYMS
|
57
|
+
word.upcase
|
58
|
+
else
|
59
|
+
word
|
90
60
|
end
|
91
|
-
end
|
92
|
-
phrases
|
61
|
+
end.join(' ').gsub(/(['’])S\b/, '\1s')
|
93
62
|
end
|
94
63
|
|
95
64
|
end
|
data/lib/lcbo/version.rb
CHANGED
@@ -8,7 +8,7 @@ describe LCBO::CrawlKit::TitleCaseHelper do
|
|
8
8
|
'MONDAVI TO-KALON FUMÉ BLANC' => 'Mondavi To-Kalon Fumé Blanc',
|
9
9
|
'ÉVE PICARD' => 'Éve Picard',
|
10
10
|
'R. PHILLIPS NIGHT HARVEST SHIRAZ' => 'R. Phillips Night Harvest Shiraz',
|
11
|
-
'02 OPUS ONE NAPA VALLEY C.V.B.G' => '02 Opus One Napa Valley C.V.B.G',
|
11
|
+
'02 OPUS ONE NAPA VALLEY C.V.B.G' => '02 Opus One Napa Valley C.V.B.G.',
|
12
12
|
'LONDON XXX' => 'London XXX',
|
13
13
|
'SOME NICE VQA WINE' => 'Some Nice VQA Wine',
|
14
14
|
'A PRODUCT NAME (WITH STUPID CRAP' => 'A Product Name',
|
@@ -21,10 +21,4 @@ describe LCBO::CrawlKit::TitleCaseHelper do
|
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
24
|
-
it 'should translate lowercase characters to uppercase characters properly' do
|
25
|
-
upper = LCBO::CrawlKit::TitleCaseHelper::UPPER_CHARS
|
26
|
-
lower = LCBO::CrawlKit::TitleCaseHelper::LOWER_CHARS
|
27
|
-
LCBO::CrawlKit::TitleCaseHelper.upcase(lower).must_equal upper
|
28
|
-
end
|
29
|
-
|
30
24
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 9
|
8
|
-
- 6
|
9
|
-
version: 0.9.6
|
8
|
+
- 7
|
9
|
+
version: 0.9.7
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Carsten Nielsen
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-12-
|
17
|
+
date: 2010-12-02 00:00:00 -05:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -43,6 +43,19 @@ dependencies:
|
|
43
43
|
version: "0"
|
44
44
|
type: :runtime
|
45
45
|
version_requirements: *id002
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: unicode_utils
|
48
|
+
prerelease: false
|
49
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
segments:
|
55
|
+
- 0
|
56
|
+
version: "0"
|
57
|
+
type: :runtime
|
58
|
+
version_requirements: *id003
|
46
59
|
description: Request and parse product, store, inventory, and product search pages directly from the official LCBO website.
|
47
60
|
email:
|
48
61
|
- heycarsten@gmail.com
|