lcbo 0.9.6 → 0.9.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.md +4 -0
- data/Gemfile +1 -0
- data/Gemfile.lock +2 -0
- data/lcbo.gemspec +1 -0
- data/lib/lcbo/crawlkit.rb +1 -0
- data/lib/lcbo/crawlkit/response.rb +3 -3
- data/lib/lcbo/crawlkit/titlecase_helper.rb +39 -70
- data/lib/lcbo/version.rb +1 -1
- data/spec/crawlkit/titlecase_helper_spec.rb +1 -7
- metadata +16 -3
data/CHANGELOG.md
CHANGED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
data/lcbo.gemspec
CHANGED
data/lib/lcbo/crawlkit.rb
CHANGED
data/lib/lcbo/crawlkit/response.rb
CHANGED
@@ -18,10 +18,10 @@ module LCBO
|
|
18
18
|
end
|
19
19
|
|
20
20
|
def self.normalize_encoding(html)
|
21
|
-
if html.valid_encoding?
|
22
|
-
html
|
23
|
-
else
|
21
|
+
if html.force_encoding('ISO-8859-1').valid_encoding?
|
24
22
|
html.encode('UTF-8', 'ISO-8859-1')
|
23
|
+
else
|
24
|
+
html.encode('UTF-8')
|
25
25
|
end.gsub("\r\n", "\n")
|
26
26
|
end
|
27
27
|
|
data/lib/lcbo/crawlkit/titlecase_helper.rb
CHANGED
@@ -4,92 +4,61 @@ module LCBO
|
|
4
4
|
module CrawlKit
|
5
5
|
class TitleCaseHelper
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
7
|
+
SMALL_WORDS = %w[
|
8
|
+
a an and as at but by en for
|
9
|
+
if in of del de on or the to
|
10
|
+
v v. via vs vs.
|
11
|
+
]
|
12
|
+
|
13
|
+
ACRONYMS = %w[
|
14
|
+
vqa vsop xo nq5 vs xxx igt
|
15
|
+
xiii xi xoxo srl bdb cvbg
|
16
|
+
ocb lcbo i ii iii
|
17
|
+
]
|
16
18
|
|
17
19
|
attr_reader :input
|
18
20
|
|
19
21
|
def self.[](string)
|
20
|
-
|
22
|
+
titlecase(string)
|
21
23
|
end
|
22
24
|
|
23
25
|
def self.upcase(string)
|
24
|
-
|
26
|
+
UnicodeUtils.simple_upcase(string)
|
25
27
|
end
|
26
28
|
|
27
29
|
def self.downcase(string)
|
28
|
-
|
29
|
-
end
|
30
|
-
|
31
|
-
def self.preclean(string)
|
32
|
-
# Strip useless bracketed crap: Some Product Name (Some Redundant Stuff)**
|
33
|
-
string.gsub(/\(.+\Z/, '').
|
34
|
-
# Strip trailing stars.
|
35
|
-
gsub(/\*+\Z/, '')
|
30
|
+
UnicodeUtils.simple_downcase(string)
|
36
31
|
end
|
37
32
|
|
38
33
|
def self.capitalize(string)
|
39
|
-
|
40
|
-
if first_letter
|
41
|
-
uchar = upcase(first_letter)
|
42
|
-
string.sub(/#{first_letter}/u, uchar)
|
43
|
-
else
|
44
|
-
string
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
def self.titleize(string)
|
49
|
-
phrases(preclean(downcase(string))).map do |phrase|
|
50
|
-
words = phrase.split
|
51
|
-
words.map do |word|
|
52
|
-
def word.capitalize
|
53
|
-
self.sub(ALPHA_RE) { |subword| TitleCaseHelper.capitalize(subword) }
|
54
|
-
end
|
55
|
-
case word
|
56
|
-
when *(ACRONYMS + ACRONYMS.map { |ac| capitalize(ac) })
|
57
|
-
upcase(word)
|
58
|
-
when /#{ALPHA_RANGE}\&#{ALPHA_RANGE}/u # words with &, like E&J
|
59
|
-
word.split(/\&/).map { |w| capitalize(w) }.join('&')
|
60
|
-
when /#{ALPHA_RANGE}\-#{ALPHA_RANGE}/u # words with dashes, like "Smith-Weston"
|
61
|
-
word.split(/\-/).map { |w| capitalize(w) }.join('-')
|
62
|
-
when /#{ALPHA_RANGE}\/#{ALPHA_RANGE}/u # words with slashes
|
63
|
-
word.split(/\//).map { |w| capitalize(w) }.join(' / ')
|
64
|
-
when /#{ALPHA_RANGE}\.#{ALPHA_RANGE}/u # words with dots, like "example.com"
|
65
|
-
capitalized = word.split(/\./u).map { |w| capitalize(w) }.join('.')
|
66
|
-
'.' == word[-1, 1] ? capitalized + '.' : capitalized
|
67
|
-
when /^#{ALPHA_RANGE}.*#{UPPER_RANGE}/u # non-first letter capitalized already
|
68
|
-
word
|
69
|
-
when words.first, words.last
|
70
|
-
word.capitalize
|
71
|
-
when *(SMALL_WORDS + SMALL_WORDS.map { |small| capitalize(small) })
|
72
|
-
word.downcase
|
73
|
-
else
|
74
|
-
word.capitalize
|
75
|
-
end
|
76
|
-
end.join(' ')
|
77
|
-
end.join(' ').
|
78
|
-
# Special case for Word'S
|
79
|
-
gsub(/(['’])S\b/, '\1s')
|
34
|
+
UnicodeUtils.titlecase(string)
|
80
35
|
end
|
81
36
|
|
82
|
-
def self.
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
37
|
+
def self.titlecase(string)
|
38
|
+
preclean = lambda { |s|
|
39
|
+
# Strip bracketed stuff and trailing junk: Product (Junk)**
|
40
|
+
s.gsub(/\(.+\Z/, '').gsub(/\*+\Z/, '').strip
|
41
|
+
}
|
42
|
+
count = 0 # Ewwww
|
43
|
+
capitalize(preclean.(string)).split.map do |word|
|
44
|
+
count += 1
|
45
|
+
case word.downcase
|
46
|
+
when /[\w]\/[\w]/ # words with slashes
|
47
|
+
word.split('/').map { |w| capitalize(w) }.join(' / ')
|
48
|
+
when /[\w]\&[\w]/ # words with &, like E&J
|
49
|
+
word.split('&').map { |w| capitalize(w) }.join('&')
|
50
|
+
when /[\w]\-[\w]/ # words with dashes, like "Super-Cool"
|
51
|
+
word.split('-').map { |w| capitalize(w) }.join('-')
|
52
|
+
when /[\w]\.[\w]/ # words with dots, like "A.B.C."
|
53
|
+
word.split('.').map { |w| upcase(w) }.join('.') + '.'
|
54
|
+
when *SMALL_WORDS
|
55
|
+
1 == count ? word : word.downcase
|
56
|
+
when *ACRONYMS
|
57
|
+
word.upcase
|
58
|
+
else
|
59
|
+
word
|
90
60
|
end
|
91
|
-
end
|
92
|
-
phrases
|
61
|
+
end.join(' ').gsub(/(['’])S\b/, '\1s')
|
93
62
|
end
|
94
63
|
|
95
64
|
end
|
data/lib/lcbo/version.rb
CHANGED
data/spec/crawlkit/titlecase_helper_spec.rb
CHANGED
@@ -8,7 +8,7 @@ describe LCBO::CrawlKit::TitleCaseHelper do
|
|
8
8
|
'MONDAVI TO-KALON FUMÉ BLANC' => 'Mondavi To-Kalon Fumé Blanc',
|
9
9
|
'ÉVE PICARD' => 'Éve Picard',
|
10
10
|
'R. PHILLIPS NIGHT HARVEST SHIRAZ' => 'R. Phillips Night Harvest Shiraz',
|
11
|
-
'02 OPUS ONE NAPA VALLEY C.V.B.G' => '02 Opus One Napa Valley C.V.B.G',
|
11
|
+
'02 OPUS ONE NAPA VALLEY C.V.B.G' => '02 Opus One Napa Valley C.V.B.G.',
|
12
12
|
'LONDON XXX' => 'London XXX',
|
13
13
|
'SOME NICE VQA WINE' => 'Some Nice VQA Wine',
|
14
14
|
'A PRODUCT NAME (WITH STUPID CRAP' => 'A Product Name',
|
@@ -21,10 +21,4 @@ describe LCBO::CrawlKit::TitleCaseHelper do
|
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
24
|
-
it 'should translate lowercase characters to uppercase characters properly' do
|
25
|
-
upper = LCBO::CrawlKit::TitleCaseHelper::UPPER_CHARS
|
26
|
-
lower = LCBO::CrawlKit::TitleCaseHelper::LOWER_CHARS
|
27
|
-
LCBO::CrawlKit::TitleCaseHelper.upcase(lower).must_equal upper
|
28
|
-
end
|
29
|
-
|
30
24
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 9
|
8
|
-
- 6
|
9
|
-
version: 0.9.6
|
8
|
+
- 7
|
9
|
+
version: 0.9.7
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Carsten Nielsen
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-12-
|
17
|
+
date: 2010-12-02 00:00:00 -05:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -43,6 +43,19 @@ dependencies:
|
|
43
43
|
version: "0"
|
44
44
|
type: :runtime
|
45
45
|
version_requirements: *id002
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: unicode_utils
|
48
|
+
prerelease: false
|
49
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
segments:
|
55
|
+
- 0
|
56
|
+
version: "0"
|
57
|
+
type: :runtime
|
58
|
+
version_requirements: *id003
|
46
59
|
description: Request and parse product, store, inventory, and product search pages directly from the official LCBO website.
|
47
60
|
email:
|
48
61
|
- heycarsten@gmail.com
|