lcbo 0.9.6 → 0.9.7

Sign up to get free protection for your applications and access to all features.
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ Version 0.9.7
2
+
3
+ * Refactored `TitleCaseHelper` to use the UnicodeUtils library.
4
+
1
5
  Version 0.9.6
2
6
 
3
7
  * Removed instances of `Enumerable#reduce` in favour of `Hash[]` for
data/Gemfile CHANGED
@@ -2,3 +2,4 @@ source 'http://rubygems.org'
2
2
 
3
3
  gem 'typhoeus'
4
4
  gem 'nokogiri'
5
+ gem 'unicode_utils'
data/Gemfile.lock CHANGED
@@ -5,6 +5,7 @@ GEM
5
5
  rack (1.2.1)
6
6
  typhoeus (0.1.31)
7
7
  rack
8
+ unicode_utils (1.0.0)
8
9
 
9
10
  PLATFORMS
10
11
  ruby
@@ -12,3 +13,4 @@ PLATFORMS
12
13
  DEPENDENCIES
13
14
  nokogiri
14
15
  typhoeus
16
+ unicode_utils
data/lcbo.gemspec CHANGED
@@ -16,6 +16,7 @@ Gem::Specification.new do |s|
16
16
 
17
17
  s.add_dependency 'typhoeus'
18
18
  s.add_dependency 'nokogiri'
19
+ s.add_dependency 'unicode_utils'
19
20
 
20
21
  s.files = `git ls-files`.split(?\n)
21
22
  s.test_files = `git ls-files -- {test,spec}/*`.split(?\n)
data/lib/lcbo/crawlkit.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require 'nokogiri'
2
2
  require 'typhoeus'
3
+ require 'unicode_utils'
3
4
  require 'uri'
4
5
 
5
6
  module LCBO
@@ -18,10 +18,10 @@ module LCBO
18
18
  end
19
19
 
20
20
  def self.normalize_encoding(html)
21
- if html.valid_encoding?
22
- html
23
- else
21
+ if html.force_encoding('ISO-8859-1').valid_encoding?
24
22
  html.encode('UTF-8', 'ISO-8859-1')
23
+ else
24
+ html.encode('UTF-8')
25
25
  end.gsub("\r\n", "\n")
26
26
  end
27
27
 
@@ -4,92 +4,61 @@ module LCBO
4
4
  module CrawlKit
5
5
  class TitleCaseHelper
6
6
 
7
- UPPER_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝ'
8
- LOWER_CHARS = 'abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïðñòóôõöøùúûüý'
9
- ALPHA_RANGE = "[#{UPPER_CHARS}#{LOWER_CHARS}]"
10
- UPPER_RANGE = "[#{UPPER_CHARS}]"
11
- LOWER_RANGE = "[#{LOWER_CHARS}]"
12
- FIRST_CHAR_RE = /#{ALPHA_RANGE}{1}/u
13
- ALPHA_RE = /#{ALPHA_RANGE}.*/u
14
- SMALL_WORDS = %w[ a an and as at but by en for if in of del de on or the to v v. via vs vs. ]
15
- ACRONYMS = %w[ vqa vsop xo nq5 vs xxx igt xiii xi xoxo srl bdb cvbg ocb lcbo i ii iii ]
7
+ SMALL_WORDS = %w[
8
+ a an and as at but by en for
9
+ if in of del de on or the to
10
+ v v. via vs vs.
11
+ ]
12
+
13
+ ACRONYMS = %w[
14
+ vqa vsop xo nq5 vs xxx igt
15
+ xiii xi xoxo srl bdb cvbg
16
+ ocb lcbo i ii iii
17
+ ]
16
18
 
17
19
  attr_reader :input
18
20
 
19
21
  def self.[](string)
20
- titleize(string)
22
+ titlecase(string)
21
23
  end
22
24
 
23
25
  def self.upcase(string)
24
- string.tr(LOWER_CHARS, UPPER_CHARS)
26
+ UnicodeUtils.simple_upcase(string)
25
27
  end
26
28
 
27
29
  def self.downcase(string)
28
- string.tr(UPPER_CHARS, LOWER_CHARS)
29
- end
30
-
31
- def self.preclean(string)
32
- # Strip useless bracketed crap: Some Product Name (Some Redundant Stuff)**
33
- string.gsub(/\(.+\Z/, '').
34
- # Strip trailing stars.
35
- gsub(/\*+\Z/, '')
30
+ UnicodeUtils.simple_downcase(string)
36
31
  end
37
32
 
38
33
  def self.capitalize(string)
39
- first_letter = string.scan(FIRST_CHAR_RE)[0]
40
- if first_letter
41
- uchar = upcase(first_letter)
42
- string.sub(/#{first_letter}/u, uchar)
43
- else
44
- string
45
- end
46
- end
47
-
48
- def self.titleize(string)
49
- phrases(preclean(downcase(string))).map do |phrase|
50
- words = phrase.split
51
- words.map do |word|
52
- def word.capitalize
53
- self.sub(ALPHA_RE) { |subword| TitleCaseHelper.capitalize(subword) }
54
- end
55
- case word
56
- when *(ACRONYMS + ACRONYMS.map { |ac| capitalize(ac) })
57
- upcase(word)
58
- when /#{ALPHA_RANGE}\&#{ALPHA_RANGE}/u # words with &, like E&J
59
- word.split(/\&/).map { |w| capitalize(w) }.join('&')
60
- when /#{ALPHA_RANGE}\-#{ALPHA_RANGE}/u # words with dashes, like "Smith-Weston"
61
- word.split(/\-/).map { |w| capitalize(w) }.join('-')
62
- when /#{ALPHA_RANGE}\/#{ALPHA_RANGE}/u # words with slashes
63
- word.split(/\//).map { |w| capitalize(w) }.join(' / ')
64
- when /#{ALPHA_RANGE}\.#{ALPHA_RANGE}/u # words with dots, like "example.com"
65
- capitalized = word.split(/\./u).map { |w| capitalize(w) }.join('.')
66
- '.' == word[-1, 1] ? capitalized + '.' : capitalized
67
- when /^#{ALPHA_RANGE}.*#{UPPER_RANGE}/u # non-first letter capitalized already
68
- word
69
- when words.first, words.last
70
- word.capitalize
71
- when *(SMALL_WORDS + SMALL_WORDS.map { |small| capitalize(small) })
72
- word.downcase
73
- else
74
- word.capitalize
75
- end
76
- end.join(' ')
77
- end.join(' ').
78
- # Special case for Word'S
79
- gsub(/(['’])S\b/, '\1s')
34
+ UnicodeUtils.titlecase(string)
80
35
  end
81
36
 
82
- def self.phrases(title)
83
- phrases = title.scan(/.+?(?:[:.;?!] |$)/u).map { |phrase| phrase.strip }
84
- # rejoin phrases that were split on the '.' from a small word
85
- if phrases.size > 1
86
- phrases[0..-2].each_with_index do |phrase, index|
87
- if SMALL_WORDS.include?(phrase.split.last.downcase)
88
- phrases[index] << " " + phrases.slice!(index + 1)
89
- end
37
+ def self.titlecase(string)
38
+ preclean = lambda { |s|
39
+ # Strip bracketed stuff and trailing junk: Product (Junk)**
40
+ s.gsub(/\(.+\Z/, '').gsub(/\*+\Z/, '').strip
41
+ }
42
+ count = 0 # Ewwww
43
+ capitalize(preclean.(string)).split.map do |word|
44
+ count += 1
45
+ case word.downcase
46
+ when /[\w]\/[\w]/ # words with slashes
47
+ word.split('/').map { |w| capitalize(w) }.join(' / ')
48
+ when /[\w]\&[\w]/ # words with &, like E&J
49
+ word.split('&').map { |w| capitalize(w) }.join('&')
50
+ when /[\w]\-[\w]/ # words with dashes, like "Super-Cool"
51
+ word.split('-').map { |w| capitalize(w) }.join('-')
52
+ when /[\w]\.[\w]/ # words with dots, like "A.B.C."
53
+ word.split('.').map { |w| upcase(w) }.join('.') + '.'
54
+ when *SMALL_WORDS
55
+ 1 == count ? word : word.downcase
56
+ when *ACRONYMS
57
+ word.upcase
58
+ else
59
+ word
90
60
  end
91
- end
92
- phrases
61
+ end.join(' ').gsub(/(['’])S\b/, '\1s')
93
62
  end
94
63
 
95
64
  end
data/lib/lcbo/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module LCBO
2
- VERSION = '0.9.6'
2
+ VERSION = '0.9.7'
3
3
  end
@@ -8,7 +8,7 @@ describe LCBO::CrawlKit::TitleCaseHelper do
8
8
  'MONDAVI TO-KALON FUMÉ BLANC' => 'Mondavi To-Kalon Fumé Blanc',
9
9
  'ÉVE PICARD' => 'Éve Picard',
10
10
  'R. PHILLIPS NIGHT HARVEST SHIRAZ' => 'R. Phillips Night Harvest Shiraz',
11
- '02 OPUS ONE NAPA VALLEY C.V.B.G' => '02 Opus One Napa Valley C.V.B.G',
11
+ '02 OPUS ONE NAPA VALLEY C.V.B.G' => '02 Opus One Napa Valley C.V.B.G.',
12
12
  'LONDON XXX' => 'London XXX',
13
13
  'SOME NICE VQA WINE' => 'Some Nice VQA Wine',
14
14
  'A PRODUCT NAME (WITH STUPID CRAP' => 'A Product Name',
@@ -21,10 +21,4 @@ describe LCBO::CrawlKit::TitleCaseHelper do
21
21
  end
22
22
  end
23
23
 
24
- it 'should translate lowercase characters to uppercase characters properly' do
25
- upper = LCBO::CrawlKit::TitleCaseHelper::UPPER_CHARS
26
- lower = LCBO::CrawlKit::TitleCaseHelper::LOWER_CHARS
27
- LCBO::CrawlKit::TitleCaseHelper.upcase(lower).must_equal upper
28
- end
29
-
30
24
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 9
8
- - 6
9
- version: 0.9.6
8
+ - 7
9
+ version: 0.9.7
10
10
  platform: ruby
11
11
  authors:
12
12
  - Carsten Nielsen
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-12-01 00:00:00 -05:00
17
+ date: 2010-12-02 00:00:00 -05:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -43,6 +43,19 @@ dependencies:
43
43
  version: "0"
44
44
  type: :runtime
45
45
  version_requirements: *id002
46
+ - !ruby/object:Gem::Dependency
47
+ name: unicode_utils
48
+ prerelease: false
49
+ requirement: &id003 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ segments:
55
+ - 0
56
+ version: "0"
57
+ type: :runtime
58
+ version_requirements: *id003
46
59
  description: Request and parse product, store, inventory, and product search pages directly from the official LCBO website.
47
60
  email:
48
61
  - heycarsten@gmail.com