lcbo 0.9.6 → 0.9.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ Version 0.9.7
2
+
3
+ * Refactored `TitleCaseHelper` to use the UnicodeUtils library.
4
+
1
5
  Version 0.9.6
2
6
 
3
7
  * Removed instances of `Enumerable#reduce` in favour of `Hash[]` for
data/Gemfile CHANGED
@@ -2,3 +2,4 @@ source 'http://rubygems.org'
2
2
 
3
3
  gem 'typhoeus'
4
4
  gem 'nokogiri'
5
+ gem 'unicode_utils'
data/Gemfile.lock CHANGED
@@ -5,6 +5,7 @@ GEM
5
5
  rack (1.2.1)
6
6
  typhoeus (0.1.31)
7
7
  rack
8
+ unicode_utils (1.0.0)
8
9
 
9
10
  PLATFORMS
10
11
  ruby
@@ -12,3 +13,4 @@ PLATFORMS
12
13
  DEPENDENCIES
13
14
  nokogiri
14
15
  typhoeus
16
+ unicode_utils
data/lcbo.gemspec CHANGED
@@ -16,6 +16,7 @@ Gem::Specification.new do |s|
16
16
 
17
17
  s.add_dependency 'typhoeus'
18
18
  s.add_dependency 'nokogiri'
19
+ s.add_dependency 'unicode_utils'
19
20
 
20
21
  s.files = `git ls-files`.split(?\n)
21
22
  s.test_files = `git ls-files -- {test,spec}/*`.split(?\n)
data/lib/lcbo/crawlkit.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require 'nokogiri'
2
2
  require 'typhoeus'
3
+ require 'unicode_utils'
3
4
  require 'uri'
4
5
 
5
6
  module LCBO
@@ -18,10 +18,10 @@ module LCBO
18
18
  end
19
19
 
20
20
  def self.normalize_encoding(html)
21
- if html.valid_encoding?
22
- html
23
- else
21
+ if html.force_encoding('ISO-8859-1').valid_encoding?
24
22
  html.encode('UTF-8', 'ISO-8859-1')
23
+ else
24
+ html.encode('UTF-8')
25
25
  end.gsub("\r\n", "\n")
26
26
  end
27
27
 
@@ -4,92 +4,61 @@ module LCBO
4
4
  module CrawlKit
5
5
  class TitleCaseHelper
6
6
 
7
- UPPER_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝ'
8
- LOWER_CHARS = 'abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïðñòóôõöøùúûüý'
9
- ALPHA_RANGE = "[#{UPPER_CHARS}#{LOWER_CHARS}]"
10
- UPPER_RANGE = "[#{UPPER_CHARS}]"
11
- LOWER_RANGE = "[#{LOWER_CHARS}]"
12
- FIRST_CHAR_RE = /#{ALPHA_RANGE}{1}/u
13
- ALPHA_RE = /#{ALPHA_RANGE}.*/u
14
- SMALL_WORDS = %w[ a an and as at but by en for if in of del de on or the to v v. via vs vs. ]
15
- ACRONYMS = %w[ vqa vsop xo nq5 vs xxx igt xiii xi xoxo srl bdb cvbg ocb lcbo i ii iii ]
7
+ SMALL_WORDS = %w[
8
+ a an and as at but by en for
9
+ if in of del de on or the to
10
+ v v. via vs vs.
11
+ ]
12
+
13
+ ACRONYMS = %w[
14
+ vqa vsop xo nq5 vs xxx igt
15
+ xiii xi xoxo srl bdb cvbg
16
+ ocb lcbo i ii iii
17
+ ]
16
18
 
17
19
  attr_reader :input
18
20
 
19
21
  def self.[](string)
20
- titleize(string)
22
+ titlecase(string)
21
23
  end
22
24
 
23
25
  def self.upcase(string)
24
- string.tr(LOWER_CHARS, UPPER_CHARS)
26
+ UnicodeUtils.simple_upcase(string)
25
27
  end
26
28
 
27
29
  def self.downcase(string)
28
- string.tr(UPPER_CHARS, LOWER_CHARS)
29
- end
30
-
31
- def self.preclean(string)
32
- # Strip useless bracketed crap: Some Product Name (Some Redundant Stuff)**
33
- string.gsub(/\(.+\Z/, '').
34
- # Strip trailing stars.
35
- gsub(/\*+\Z/, '')
30
+ UnicodeUtils.simple_downcase(string)
36
31
  end
37
32
 
38
33
  def self.capitalize(string)
39
- first_letter = string.scan(FIRST_CHAR_RE)[0]
40
- if first_letter
41
- uchar = upcase(first_letter)
42
- string.sub(/#{first_letter}/u, uchar)
43
- else
44
- string
45
- end
46
- end
47
-
48
- def self.titleize(string)
49
- phrases(preclean(downcase(string))).map do |phrase|
50
- words = phrase.split
51
- words.map do |word|
52
- def word.capitalize
53
- self.sub(ALPHA_RE) { |subword| TitleCaseHelper.capitalize(subword) }
54
- end
55
- case word
56
- when *(ACRONYMS + ACRONYMS.map { |ac| capitalize(ac) })
57
- upcase(word)
58
- when /#{ALPHA_RANGE}\&#{ALPHA_RANGE}/u # words with &, like E&J
59
- word.split(/\&/).map { |w| capitalize(w) }.join('&')
60
- when /#{ALPHA_RANGE}\-#{ALPHA_RANGE}/u # words with dashes, like "Smith-Weston"
61
- word.split(/\-/).map { |w| capitalize(w) }.join('-')
62
- when /#{ALPHA_RANGE}\/#{ALPHA_RANGE}/u # words with slashes
63
- word.split(/\//).map { |w| capitalize(w) }.join(' / ')
64
- when /#{ALPHA_RANGE}\.#{ALPHA_RANGE}/u # words with dots, like "example.com"
65
- capitalized = word.split(/\./u).map { |w| capitalize(w) }.join('.')
66
- '.' == word[-1, 1] ? capitalized + '.' : capitalized
67
- when /^#{ALPHA_RANGE}.*#{UPPER_RANGE}/u # non-first letter capitalized already
68
- word
69
- when words.first, words.last
70
- word.capitalize
71
- when *(SMALL_WORDS + SMALL_WORDS.map { |small| capitalize(small) })
72
- word.downcase
73
- else
74
- word.capitalize
75
- end
76
- end.join(' ')
77
- end.join(' ').
78
- # Special case for Word'S
79
- gsub(/(['’])S\b/, '\1s')
34
+ UnicodeUtils.titlecase(string)
80
35
  end
81
36
 
82
- def self.phrases(title)
83
- phrases = title.scan(/.+?(?:[:.;?!] |$)/u).map { |phrase| phrase.strip }
84
- # rejoin phrases that were split on the '.' from a small word
85
- if phrases.size > 1
86
- phrases[0..-2].each_with_index do |phrase, index|
87
- if SMALL_WORDS.include?(phrase.split.last.downcase)
88
- phrases[index] << " " + phrases.slice!(index + 1)
89
- end
37
+ def self.titlecase(string)
38
+ preclean = lambda { |s|
39
+ # Strip bracketed stuff and trailing junk: Product (Junk)**
40
+ s.gsub(/\(.+\Z/, '').gsub(/\*+\Z/, '').strip
41
+ }
42
+ count = 0 # Ewwww
43
+ capitalize(preclean.(string)).split.map do |word|
44
+ count += 1
45
+ case word.downcase
46
+ when /[\w]\/[\w]/ # words with slashes
47
+ word.split('/').map { |w| capitalize(w) }.join(' / ')
48
+ when /[\w]\&[\w]/ # words with &, like E&J
49
+ word.split('&').map { |w| capitalize(w) }.join('&')
50
+ when /[\w]\-[\w]/ # words with dashes, like "Super-Cool"
51
+ word.split('-').map { |w| capitalize(w) }.join('-')
52
+ when /[\w]\.[\w]/ # words with dots, like "A.B.C."
53
+ word.split('.').map { |w| upcase(w) }.join('.') + '.'
54
+ when *SMALL_WORDS
55
+ 1 == count ? word : word.downcase
56
+ when *ACRONYMS
57
+ word.upcase
58
+ else
59
+ word
90
60
  end
91
- end
92
- phrases
61
+ end.join(' ').gsub(/(['’])S\b/, '\1s')
93
62
  end
94
63
 
95
64
  end
data/lib/lcbo/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module LCBO
2
- VERSION = '0.9.6'
2
+ VERSION = '0.9.7'
3
3
  end
@@ -8,7 +8,7 @@ describe LCBO::CrawlKit::TitleCaseHelper do
8
8
  'MONDAVI TO-KALON FUMÉ BLANC' => 'Mondavi To-Kalon Fumé Blanc',
9
9
  'ÉVE PICARD' => 'Éve Picard',
10
10
  'R. PHILLIPS NIGHT HARVEST SHIRAZ' => 'R. Phillips Night Harvest Shiraz',
11
- '02 OPUS ONE NAPA VALLEY C.V.B.G' => '02 Opus One Napa Valley C.V.B.G',
11
+ '02 OPUS ONE NAPA VALLEY C.V.B.G' => '02 Opus One Napa Valley C.V.B.G.',
12
12
  'LONDON XXX' => 'London XXX',
13
13
  'SOME NICE VQA WINE' => 'Some Nice VQA Wine',
14
14
  'A PRODUCT NAME (WITH STUPID CRAP' => 'A Product Name',
@@ -21,10 +21,4 @@ describe LCBO::CrawlKit::TitleCaseHelper do
21
21
  end
22
22
  end
23
23
 
24
- it 'should translate lowercase characters to uppercase characters properly' do
25
- upper = LCBO::CrawlKit::TitleCaseHelper::UPPER_CHARS
26
- lower = LCBO::CrawlKit::TitleCaseHelper::LOWER_CHARS
27
- LCBO::CrawlKit::TitleCaseHelper.upcase(lower).must_equal upper
28
- end
29
-
30
24
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 9
8
- - 6
9
- version: 0.9.6
8
+ - 7
9
+ version: 0.9.7
10
10
  platform: ruby
11
11
  authors:
12
12
  - Carsten Nielsen
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-12-01 00:00:00 -05:00
17
+ date: 2010-12-02 00:00:00 -05:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -43,6 +43,19 @@ dependencies:
43
43
  version: "0"
44
44
  type: :runtime
45
45
  version_requirements: *id002
46
+ - !ruby/object:Gem::Dependency
47
+ name: unicode_utils
48
+ prerelease: false
49
+ requirement: &id003 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ segments:
55
+ - 0
56
+ version: "0"
57
+ type: :runtime
58
+ version_requirements: *id003
46
59
  description: Request and parse product, store, inventory, and product search pages directly from the official LCBO website.
47
60
  email:
48
61
  - heycarsten@gmail.com