quandl_babelfish 0.0.10 → 0.0.11

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -1,7 +1,7 @@
1
- .idea/
2
- /Gemfile.lock
3
- /pkg
4
- /tmp
5
- .rvmrc
6
- *.gem
7
- *.log
1
+ .idea/
2
+ /Gemfile.lock
3
+ /pkg
4
+ /tmp
5
+ .rvmrc
6
+ *.gem
7
+ *.log
@@ -1,12 +1,12 @@
1
- language: ruby
2
-
3
- rvm:
4
- - 2.0.0
5
- - 1.9.3
6
-
7
- gemfile:
8
- - Gemfile
9
-
10
- matrix:
11
-
12
- script: bundle exec rspec
1
+ language: ruby
2
+
3
+ rvm:
4
+ - 2.0.0
5
+ - 1.9.3
6
+
7
+ gemfile:
8
+ - Gemfile
9
+
10
+ matrix:
11
+
12
+ script: bundle exec rspec
data/Gemfile CHANGED
@@ -1,2 +1,2 @@
1
- source "https://rubygems.org"
1
+ source "https://rubygems.org"
2
2
  gemspec
data/LICENSE CHANGED
@@ -1,7 +1,7 @@
1
- Copyright (c) 2012-2013 Quandl
2
-
3
- Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
-
5
- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
-
7
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
1
+ Copyright (c) 2012-2013 Quandl
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md CHANGED
@@ -1,18 +1,18 @@
1
- # Quandl::Babelfish
2
-
3
- ### Installation
4
-
5
- ```ruby
6
-
7
-
8
- gem 'quandl_babelfish'
9
-
10
- ```
11
-
12
-
13
- ### Usage
14
-
15
- ```ruby
16
-
17
-
18
- ```
1
+ # Quandl::Babelfish
2
+
3
+ ### Installation
4
+
5
+ ```ruby
6
+
7
+
8
+ gem 'quandl_babelfish'
9
+
10
+ ```
11
+
12
+
13
+ ### Usage
14
+
15
+ ```ruby
16
+
17
+
18
+ ```
data/Rakefile CHANGED
@@ -1,8 +1,8 @@
1
- require "bundler"
2
- require "rake"
3
- require "bundler/gem_tasks"
4
-
5
- $:.unshift File.join(File.dirname(__FILE__), *%w[.. lib])
6
-
7
- require 'pry'
1
+ require "bundler"
2
+ require "rake"
3
+ require "bundler/gem_tasks"
4
+
5
+ $:.unshift File.join(File.dirname(__FILE__), *%w[.. lib])
6
+
7
+ require 'pry'
8
8
  require "quandl/babelfish"
data/UPGRADE.md CHANGED
@@ -1,44 +1,47 @@
1
- ## 0.0.9
2
-
3
- * QUGC-54 write failing spec for input with nil row value
4
-
5
-
6
- ## 0.0.8
7
-
8
- * QUGC-42 you should not have to include data; you might just want to update headers
9
-
10
-
11
- ## 0.0.7
12
-
13
- * added squaring final data
14
-
15
-
16
- ## 0.0.6
17
-
18
- * add Babelfish::Chronometer
19
- * add Babelfish.guess_frequency
20
-
21
- ## 0.0.5
22
-
23
- * improve error messages with line, row, context
24
- * add Quandl::Error::Standard, all errors inherit from Error::Standard
25
-
26
-
27
- ## 0.0.4
28
-
29
- * remove quandl_data as a dependency
30
-
31
-
32
- ## 0.0.3
33
-
34
- * Add Quandl::Data as a add_runtime_dependency
35
- * refactor Babelfish::Data to inherit from Quandl::Data
36
- * refactor specs
37
-
38
-
39
- ## 0.0.1
40
-
41
- * replace Cleaner.process return clean_array, header with Quandl::Babelfish::Data.new( clean_array, headers: header )
42
- * refactored error
43
- * added header extraction support
1
+ ## 0.0.11
2
+ * Fixed US Dates
3
+
4
+ ## 0.0.9
5
+
6
+ * QUGC-54 write failing spec for input with nil row value
7
+
8
+
9
+ ## 0.0.8
10
+
11
+ * QUGC-42 you should not have to include data; you might just want to update headers
12
+
13
+
14
+ ## 0.0.7
15
+
16
+ * added squaring final data
17
+
18
+
19
+ ## 0.0.6
20
+
21
+ * add Babelfish::Chronometer
22
+ * add Babelfish.guess_frequency
23
+
24
+ ## 0.0.5
25
+
26
+ * improve error messages with line, row, context
27
+ * add Quandl::Error::Standard, all errors inherit from Error::Standard
28
+
29
+
30
+ ## 0.0.4
31
+
32
+ * remove quandl_data as a dependency
33
+
34
+
35
+ ## 0.0.3
36
+
37
+ * Add Quandl::Data as a add_runtime_dependency
38
+ * refactor Babelfish::Data to inherit from Quandl::Data
39
+ * refactor specs
40
+
41
+
42
+ ## 0.0.1
43
+
44
+ * replace Cleaner.process return clean_array, header with Quandl::Babelfish::Data.new( clean_array, headers: header )
45
+ * refactored error
46
+ * added header extraction support
44
47
  * init
@@ -1,29 +1,29 @@
1
- require "quandl/babelfish/version"
2
-
3
- require "quandl/babelfish/helper"
4
- require "quandl/babelfish/cleaner"
5
- require "quandl/babelfish/date_maid"
6
- require "quandl/babelfish/number_maid"
7
- require "quandl/babelfish/chronometer"
8
-
9
- require 'quandl/error/standard'
10
- require 'quandl/error/guess_date_format'
11
- require 'quandl/error/invalid_date'
12
- require 'quandl/error/unknown_date_format'
13
-
14
- module Quandl
15
- module Babelfish
16
-
17
- class << self
18
- def clean(data, date_settings={}, number_settings={})
19
- Cleaner::process data, date_settings, number_settings
20
- end
21
-
22
- def guess_frequency(data)
23
- Chronometer::process data
24
- end
25
-
26
- end
27
-
28
- end
1
+ require "quandl/babelfish/version"
2
+
3
+ require "quandl/babelfish/helper"
4
+ require "quandl/babelfish/cleaner"
5
+ require "quandl/babelfish/date_maid"
6
+ require "quandl/babelfish/number_maid"
7
+ require "quandl/babelfish/chronometer"
8
+
9
+ require 'quandl/error/standard'
10
+ require 'quandl/error/guess_date_format'
11
+ require 'quandl/error/invalid_date'
12
+ require 'quandl/error/unknown_date_format'
13
+
14
+ module Quandl
15
+ module Babelfish
16
+
17
+ class << self
18
+ def clean(data, date_settings={}, number_settings={})
19
+ Cleaner::process data, date_settings, number_settings
20
+ end
21
+
22
+ def guess_frequency(data)
23
+ Chronometer::process data
24
+ end
25
+
26
+ end
27
+
28
+ end
29
29
  end
@@ -1,44 +1,44 @@
1
- module Quandl
2
- module Babelfish
3
-
4
- class Chronometer
5
- class << self
6
-
7
- #return frequency and warning message if present
8
- def process(table)
9
- # guesses date frequency in a table
10
- return nil if table.nil? || table.size==0
11
- return 'daily' if table.size==1 #not enough , need more points
12
- freqs = []
13
- fmt = "%Y-%m"
14
- fmt = "%Y" if table[0][0].to_s !~ /-/
15
- fmt = "%Y-%m-%d" if table[0][0].to_s =~ /^.*-.*-.*$/
16
-
17
- table.each_index do |r|
18
- break if r==6 #first 6 record is enough to analyze
19
- if table[r+1].nil?
20
- break
21
- else
22
- diff = (Date.strptime(table[r+1][0].to_s, fmt) -
23
- Date.strptime(table[r][0].to_s, fmt)).to_i.abs
24
- if diff < 4
25
- freqs << 'daily'
26
- elsif diff < 10
27
- freqs << 'weekly'
28
- elsif diff < 60
29
- freqs << 'monthly'
30
- elsif diff < 200
31
- freqs << 'quarterly'
32
- else
33
- freqs << 'annual'
34
- end
35
- end
36
- end
37
- return freqs.sort_by { |e| freqs.count(e) }.reverse.first#, nil
38
- end
39
-
40
- end
41
- end
42
-
43
- end
1
+ module Quandl
2
+ module Babelfish
3
+
4
+ class Chronometer
5
+ class << self
6
+
7
+ #returns the most common frequency guessed from the first rows (the warning message return value is currently commented out)
8
+ def process(table)
9
+ # guesses date frequency in a table
10
+ return nil if table.nil? || table.size==0
11
+ return 'daily' if table.size==1 #not enough , need more points
12
+ freqs = []
13
+ fmt = "%Y-%m"
14
+ fmt = "%Y" if table[0][0].to_s !~ /-/
15
+ fmt = "%Y-%m-%d" if table[0][0].to_s =~ /^.*-.*-.*$/
16
+
17
+ table.each_index do |r|
18
+ break if r==6 #first 6 record is enough to analyze
19
+ if table[r+1].nil?
20
+ break
21
+ else
22
+ diff = (Date.strptime(table[r+1][0].to_s, fmt) -
23
+ Date.strptime(table[r][0].to_s, fmt)).to_i.abs
24
+ if diff < 4
25
+ freqs << 'daily'
26
+ elsif diff < 10
27
+ freqs << 'weekly'
28
+ elsif diff < 60
29
+ freqs << 'monthly'
30
+ elsif diff < 200
31
+ freqs << 'quarterly'
32
+ else
33
+ freqs << 'annual'
34
+ end
35
+ end
36
+ end
37
+ return freqs.sort_by { |e| freqs.count(e) }.reverse.first#, nil
38
+ end
39
+
40
+ end
41
+ end
42
+
43
+ end
44
44
  end
@@ -1,36 +1,36 @@
1
- module Quandl
2
- module Babelfish
3
-
4
- class Cleaner
5
- class << self
6
- def process(dirty_array, date_settings={}, number_settings={})
7
- return nil,nil if dirty_array.nil?
8
-
9
- #check if first line is header
10
- header=DateMaid::disinfect(dirty_array.shift) unless DateMaid::analyze_date_format(DateMaid::disinfect(dirty_array[0][0])[0])[0]
11
- return [], header if dirty_array.empty?
12
-
13
- #converts dates first
14
- dirty_array
15
- dates = dirty_array.collect{|x| x[0]}
16
- DateMaid::init(date_settings)
17
- clean_dates=DateMaid::sweep dates
18
-
19
- clean_array=[]
20
- #clean numbers later
21
- NumberMaid::init(number_settings)
22
- dirty_array.each.with_index do |row, i|
23
- new_row=[]
24
- clean_row=NumberMaid::clean(row[1..-1])
25
- clean_row=[nil] if clean_row.nil?
26
- (new_row << clean_dates[i]).concat Array(clean_row) #add clean date and all clean numbers
27
- clean_array << new_row
28
- end
29
-
30
- return clean_array, header
31
- end
32
- end
33
- end
34
-
35
- end
1
+ module Quandl
2
+ module Babelfish
3
+
4
+ class Cleaner
5
+ class << self
6
+ def process(dirty_array, date_settings={}, number_settings={})
7
+ return nil,nil if dirty_array.nil?
8
+
9
+ #check if first line is header
10
+ header=DateMaid::disinfect(dirty_array.shift) unless DateMaid::analyze_date_format(DateMaid::disinfect(dirty_array[0][0])[0])[0]
11
+ return [], header if dirty_array.empty?
12
+
13
+ #converts dates first
14
+ dirty_array
15
+ dates = dirty_array.collect{|x| x[0]}
16
+ DateMaid::init(date_settings)
17
+ clean_dates=DateMaid::sweep dates
18
+
19
+ clean_array=[]
20
+ #clean numbers later
21
+ NumberMaid::init(number_settings)
22
+ dirty_array.each.with_index do |row, i|
23
+ new_row=[]
24
+ clean_row=NumberMaid::clean(row[1..-1])
25
+ clean_row=[nil] if clean_row.nil?
26
+ (new_row << clean_dates[i]).concat Array(clean_row) #add clean date and all clean numbers
27
+ clean_array << new_row
28
+ end
29
+
30
+ return clean_array, header
31
+ end
32
+ end
33
+ end
34
+
35
+ end
36
36
  end
@@ -1,247 +1,247 @@
1
- module Quandl
2
- module Babelfish
3
-
4
- #responsible for number formatting
5
- class DateMaid
6
- @defaults = {
7
- :format => nil
8
- }
9
-
10
- @settings = @defaults #init with defaults
11
-
12
- class << self
13
-
14
- def init(user_settings)
15
- @settings=@defaults.merge(user_settings)
16
- end
17
-
18
- #looks at all the dates and formats them to unambiguous ISO 8601 format (yyyy-mm-dd)
19
- def sweep(all_dates)
20
- return nil if all_dates.nil?
21
-
22
- all_dates = disinfect all_dates
23
-
24
- if @settings[:format].nil?
25
- #find good example and extract all info from it and apply it to each of the dates in the set
26
- good_sample = find_good_date(all_dates)
27
-
28
- raise( Error::GuessDateFormat.new, "Unable to find date format for provided dates" ) if good_sample.nil?
29
-
30
- date_format, frequency = analyze_date_format(good_sample)
31
-
32
-
33
- else
34
- date_format = @settings[:format]
35
- end
36
-
37
- iso_dates=[]
38
- all_dates.each_with_index do |fuzzy_date, i|
39
- temp_date = convert(fuzzy_date, date_format) rescue raise( Error::InvalidDate.new( line: i+1, row: fuzzy_date, context: 'convert' ), "Invalid date '#{fuzzy_date}'" )
40
- iso_dates << frequency_transform(temp_date, frequency)
41
- end
42
-
43
- iso_dates
44
- end
45
-
46
- def analyze_date_format(example)
47
- return nil if example.nil?
48
-
49
- # Regular formats and Custom formats (where Date.parse and Date.strptime
50
- # fear to tread)
51
- if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{4}/) # eg "07/03/2012"
52
- if re[2].to_i > 12
53
- return '%m-%d-%Y', nil
54
- else
55
- return '%d-%m-%Y', nil
56
- end
57
- end
58
- if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{2}/) # eg "07/03/12"
59
- if re[1].to_i > 12
60
- return '%d-%m-%y', nil
61
- else
62
- return '%m-%d-%y', nil
63
- end
64
- end
65
- # order these guys from most specific to most general
66
- return "%Y", "annual" if example =~ /^\d{4}[\s]?-[\s]?\d{4}$/
67
- return '%Y%m%d', 'daily' if example =~ /^\d{8}$/ && example[4..5].to_i < 13 && example[6..7].to_i < 32 # precisely 8 digits - yyyymmdd
68
- return 'epoch', 'daily' if example =~ /^\d{7}.*$/ # 7 or more digits - epoch
69
- return '%Y', 'annual' if example =~ /^\d{4}$/ # 4 digits
70
- return '%Y', 'annual' if example =~ /^\d{4}\.0$/ # 4 digits with a dot 0 for excel
71
- return ':year_quarter', 'quarterly' if example =~ /^\d{4}[Qq]\d$/ # 4 digits, Q, digit (here because the next pattern would override it)
72
- return '%YM%m', 'monthly' if example =~ /^\d{4}M\d{1,2}$/ # 2007M08
73
- return '%GW%V', 'weekly' if example =~ /^\d{4}W\d{1,2}$/ # 2012W01
74
- return '%Y-%m', 'monthly' if example =~ /^\d{4}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits
75
- return '%m-%Y', 'monthly' if example =~ /^\d{1,2}\D\d{4}$/ # 1-2 digits, separator, 4 digits
76
- return '%Y%m', 'monthly' if example =~ /^\d{6}$/ # 6 digits
77
- return '%Y-%b', 'monthly' if example =~ /^\d{4}\D\w{3}$/ # 4 digits, separator, 3 letters
78
- return '%b-%Y', 'monthly' if example =~ /^\w{3}\D\d{4}$/ # 3 letters, separator, 4 digits
79
- return '%b-%y', 'monthly' if example =~ /^\w{3}\D\d{2}$/ # 3 letters, separator, 2 digits
80
- return '%Y%b', 'monthly' if example =~ /^\d{4}\w{3}$/ # 4 digits, 3 letters
81
- return '%b%Y', 'monthly' if example =~ /^\w{3}\d{4}$/ # 3 letters, 4 digits
82
- return '%Y-%b-%d', 'daily' if example =~ /^\d{4}\D\w{3}\D\d{1,2}$/ # 4 digits, separator, 3 letters, separator, 1-2 digits
83
- return '%Y-%m-%d', 'daily' if example =~ /^\d{4}\D\d{1,2}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits, separator, 1-2 digits
84
- return '%d-%b-%Y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{4}$/ # 1-2 digits, separator, 3 letters, separator, 4 digits
85
- return '%Y%b%d', 'daily' if example =~ /^\d{4}\w{3}\d{1,2}$/ # 4 digits, 3 letters, 1-2 digits
86
- return '%d%b%Y', 'daily' if example =~ /^\d{1,2}\w{3}\d{4}$/ # 1-2 digits, 3 letters, 4 digits
87
- return '%d-%b-%y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{2}$/ # 1-2 digits, 3 letters, 2 digits
88
- return '%b-%d-%Y', 'daily' if example =~ /^\w{3}\D\d{1,2}\D{1,2}\d{4}$/ # 3 letters, separator, 1-2 digits, separator(s), 4 digits
89
-
90
- #our custom formats
91
- return ':year_quarter', 'quarterly' if example =~ /^\d{4}\D[Qq]\d$/ # 4 digits, separator, Q, digit
92
- return ':excel-1900', 'daily' if example =~ /^\d{5}$/ # 5 digits
93
- return ':excel-1900', 'daily' if example =~ /^\d{5}\.0$/ # 5 digits dot zero excel
94
-
95
- # No, try default date parse
96
- # raise PostProcessorException, "Unable to guess date format for #{example}"
97
- [nil, nil]
98
- end
99
-
100
- def disinfect(dates)
101
- [*dates].collect do |date|
102
- date.to_s.encode!('UTF-8', 'UTF-8', :invalid => :replace)
103
- date.to_s.gsub!(/[^\x01-\x7f]/,'')
104
- date.to_s.strip.gsub(/\s\s+/, ' ')
105
- end
106
- end
107
- private
108
-
109
-
110
- #converts date to specified format
111
- def convert(fuzzy_date, date_format)
112
- if date_format.nil?
113
- # Assuming a US date format with 3 parameters (i.e. MM?DD?YYYY)
114
- tokens = fuzzy_date.split(/\D/)
115
- if tokens[0].length > 2 || fuzzy_date =~ /\w{2}/
116
- # Its ISO
117
- return DateTime.parse(fuzzy_date.to_s).to_date
118
- else
119
- # Guessing US
120
- return Date.new(tokens[2].to_i, tokens[0].to_i, tokens[1].to_i)
121
- end
122
- else
123
- case date_format
124
- when ':year_quarter'
125
- return year_quarter_formatter(fuzzy_date)
126
- when ':excel-1900'
127
- return excel_1900_formatter(fuzzy_date)
128
- else #regular ruby formatter
129
- return regular_formatter(fuzzy_date, date_format)
130
- end
131
-
132
- end
133
- end
134
-
135
-
136
- def year_quarter_formatter(fuzzy_date)
137
- raw_date = fuzzy_date
138
- tokens = raw_date.gsub(/[qQ]/, '-').gsub(/[a-zA-Z]/, '').split(/[^0-9]/)
139
- tokens.delete_if {|x| x.nil? || x.empty?} # In case there are more than one delimiter because we replaced the Q
140
- Date.new(tokens[0].to_i, tokens[1].to_i * 3, 1)
141
- end
142
-
143
- def excel_1900_formatter(fuzzy_date)
144
- # handle Lotus 123 bug has 1900 as a leap year
145
- Date.civil(1899, 12, 31) + fuzzy_date.to_i - 1 if fuzzy_date.to_i > 0
146
- end
147
-
148
- def regular_formatter(fuzzy_date, date_format)
149
- # We have a date format - oh so pretty, but...
150
- date_string = fuzzy_date
151
- # normalize delimiters to hyphens so we do not have to make a format for each one.
152
- # delimiters can be letters when its all numbers and delimiters only when there are letters. Sigh.
153
- # only if no format where provided
154
- date_string = date_string.gsub(/[^\d\w]+/, '-') if @settings[:format].nil?
155
-
156
- #epoch date string
157
- if date_format == 'epoch'
158
- news = Time.at(date_string.to_i).utc.to_s.match(/\d\d\d\d-\d\d-\d\d/)
159
- formatted_date = DateTime.strptime(news.to_s, '%Y-%m-%d').to_date
160
- else
161
- if date_string.to_s =~ /^(\w{3})\D(\d{2})$/
162
- century = $2.to_i < 25 ? '20' : '19'
163
- date_string = "#{$1} #{century}#{$2}"
164
- formatted_date = DateTime.strptime(date_string.to_s, '%b %Y').to_date
165
- else
166
- formatted_date = DateTime.strptime(date_string.to_s, date_format).to_date
167
- end
168
- end
169
- formatted_date+=4 if date_format == '%GW%V' #strptime makes dates on Mondays. We want Fridays.
170
- formatted_date
171
- end
172
-
173
-
174
-
175
- #find good example of date to use as template for format
176
- #if strict == true, no ambiguity is tolerated. If strict= false, we will accept abbiguity. (02/05/2009)
177
- def find_good_date(all_dates, strict=true)
178
- good_sample=nil
179
- all_dates.each do |fuzzy_date|
180
- if usable_cell(fuzzy_date,strict)
181
- good_sample = fuzzy_date
182
- break
183
- end
184
- end
185
- if good_sample == nil and strict==true
186
- # We could not find a single unambiguous cell. Let's now be less strict and see if we can find something
187
- find_good_date(all_dates,false)
188
- else
189
- good_sample
190
- end
191
- end
192
-
193
- # if strict == true then we refuse to accept any ambiguity
194
- # if strict == false, we'll settle for a bit of ambiguity
195
- def usable_cell(cell,strict)
196
- return false if cell.nil? || cell.to_s.empty?
197
- return false if cell.to_s.size > 20 # even annotated date can't be bigger than 20
198
-
199
- return true if cell.to_s =~ /^\w{3}\D[456789]\d$/
200
-
201
- # date is not usable as an example if it is ambiguous as to day and month
202
- # 03/04/2012, for example, is ambiguous. 03/17/2012 is NOT ambiguous
203
- if re = cell.to_s.match(/^(\d{1,2})\D(\d{1,2})\D\d{2,4}/) # e.g. 03/04/2012
204
- if re[1].to_i <= 12 and re[2].to_i <= 12
205
- return strict==true ? false : true
206
- else
207
- return true
208
- end
209
- end
210
-
211
- if re = cell.to_s.match(/^(\d{1,2})\D\w{3}\D(\d{2})/) # 07-jun-07
212
- if re[1].to_i <= 12 and re[2].to_i <= 12
213
- return false
214
- else
215
- return true
216
- end
217
- end
218
-
219
- return true if cell.to_s =~ /\d{4}/ # It has a 4 digit year somewhere
220
-
221
- return true if cell.to_s =~ /^\w{3}\D\d{2}/ # %b-%y(d)..also not ambiguous
222
-
223
- false # Thank you, come again
224
- end
225
-
226
- # Bump date to the end of the respective periods
227
- def frequency_transform(date, frequency)
228
- case frequency
229
- when 'annual'
230
- date = Date.new(date.year,12,31)
231
- when 'quarterly'
232
- month = 3*((date.month-1)/3 + 1) # equals 3,6,9 or 12
233
- date = Date.new(date.year, month, 1).next_month-1
234
- when 'monthly'
235
- date = Date.new(date.year, date.month,1).next_month-1
236
- else
237
- # Do nothing for daily or weekly
238
- end
239
-
240
- date
241
- end
242
-
243
-
244
- end
245
- end
246
- end
1
+ module Quandl
2
+ module Babelfish
3
+
4
+ #responsible for date formatting
5
+ class DateMaid
6
+ @defaults = {
7
+ :format => nil
8
+ }
9
+
10
+ @settings = @defaults #init with defaults
11
+
12
+ class << self
13
+
14
+ def init(user_settings)
15
+ @settings=@defaults.merge(user_settings)
16
+ end
17
+
18
+ #looks at all the dates and formats them to unambiguous ISO 8601 format (yyyy-mm-dd)
19
+ def sweep(all_dates)
20
+ return nil if all_dates.nil?
21
+
22
+ all_dates = disinfect all_dates
23
+
24
+ if @settings[:format].nil?
25
+ #find good example and extract all info from it and apply it to each of the dates in the set
26
+ good_sample = find_good_date(all_dates)
27
+
28
+ raise( Error::GuessDateFormat.new, "Unable to find date format for provided dates" ) if good_sample.nil?
29
+
30
+ date_format, frequency = analyze_date_format(good_sample)
31
+
32
+
33
+ else
34
+ date_format = @settings[:format]
35
+ end
36
+
37
+ iso_dates=[]
38
+ all_dates.each_with_index do |fuzzy_date, i|
39
+ temp_date = convert(fuzzy_date, date_format) rescue raise( Error::InvalidDate.new( line: i+1, row: fuzzy_date, context: 'convert' ), "Invalid date '#{fuzzy_date}'" )
40
+ iso_dates << frequency_transform(temp_date, frequency)
41
+ end
42
+
43
+ iso_dates
44
+ end
45
+
46
+ def analyze_date_format(example)
47
+ return nil if example.nil?
48
+
49
+ # Regular formats and Custom formats (where Date.parse and Date.strptime
50
+ # fear to tread)
51
+ if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{4}/) # eg "07/03/2012"
52
+ if re[1].to_i > 12
53
+ return '%d-%m-%Y', nil
54
+ else
55
+ return '%m-%d-%Y', nil
56
+ end
57
+ end
58
+ if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{2}/) # eg "07/03/12"
59
+ if re[1].to_i > 12
60
+ return '%d-%m-%y', nil
61
+ else
62
+ return '%m-%d-%y', nil
63
+ end
64
+ end
65
+ # order these guys from most specific to most general
66
+ return "%Y", "annual" if example =~ /^\d{4}[\s]?-[\s]?\d{4}$/
67
+ return '%Y%m%d', 'daily' if example =~ /^\d{8}$/ && example[4..5].to_i < 13 && example[6..7].to_i < 32 # precisely 8 digits - yyyymmdd
68
+ return 'epoch', 'daily' if example =~ /^\d{7}.*$/ # 7 or more digits - epoch
69
+ return '%Y', 'annual' if example =~ /^\d{4}$/ # 4 digits
70
+ return '%Y', 'annual' if example =~ /^\d{4}\.0$/ # 4 digits with a dot 0 for excel
71
+ return ':year_quarter', 'quarterly' if example =~ /^\d{4}[Qq]\d$/ # 4 digits, Q, digit (here because the next pattern would override it)
72
+ return '%YM%m', 'monthly' if example =~ /^\d{4}M\d{1,2}$/ # 2007M08
73
+ return '%GW%V', 'weekly' if example =~ /^\d{4}W\d{1,2}$/ # 2012W01
74
+ return '%Y-%m', 'monthly' if example =~ /^\d{4}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits
75
+ return '%m-%Y', 'monthly' if example =~ /^\d{1,2}\D\d{4}$/ # 1-2 digits, separator, 4 digits
76
+ return '%Y%m', 'monthly' if example =~ /^\d{6}$/ # 6 digits
77
+ return '%Y-%b', 'monthly' if example =~ /^\d{4}\D\w{3}$/ # 4 digits, separator, 3 letters
78
+ return '%b-%Y', 'monthly' if example =~ /^\w{3}\D\d{4}$/ # 3 letters, separator, 4 digits
79
+ return '%b-%y', 'monthly' if example =~ /^\w{3}\D\d{2}$/ # 3 letters, separator, 2 digits
80
+ return '%Y%b', 'monthly' if example =~ /^\d{4}\w{3}$/ # 4 digits, 3 letters
81
+ return '%b%Y', 'monthly' if example =~ /^\w{3}\d{4}$/ # 3 letters, 4 digits
82
+ return '%Y-%b-%d', 'daily' if example =~ /^\d{4}\D\w{3}\D\d{1,2}$/ # 4 digits, separator, 3 letters, separator, 1-2 digits
83
+ return '%Y-%m-%d', 'daily' if example =~ /^\d{4}\D\d{1,2}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits, separator, 1-2 digits
84
+ return '%d-%b-%Y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{4}$/ # 1-2 digits, separator, 3 letters, separator, 4 digits
85
+ return '%Y%b%d', 'daily' if example =~ /^\d{4}\w{3}\d{1,2}$/ # 4 digits, 3 letters, 1-2 digits
86
+ return '%d%b%Y', 'daily' if example =~ /^\d{1,2}\w{3}\d{4}$/ # 1-2 digits, 3 letters, 4 digits
87
+ return '%d-%b-%y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{2}$/ # 1-2 digits, 3 letters, 2 digits
88
+ return '%b-%d-%Y', 'daily' if example =~ /^\w{3}\D\d{1,2}\D{1,2}\d{4}$/ # 3 letters, separator, 1-2 digits, separator(s), 4 digits
89
+
90
+ #our custom formats
91
+ return ':year_quarter', 'quarterly' if example =~ /^\d{4}\D[Qq]\d$/ # 4 digits, separator, Q, digit
92
+ return ':excel-1900', 'daily' if example =~ /^\d{5}$/ # 5 digits
93
+ return ':excel-1900', 'daily' if example =~ /^\d{5}\.0$/ # 5 digits dot zero excel
94
+
95
+ # No, try default date parse
96
+ # raise PostProcessorException, "Unable to guess date format for #{example}"
97
+ [nil, nil]
98
+ end
99
+
100
+ def disinfect(dates)
101
+ [*dates].collect do |date|
102
+ date.to_s.encode!('UTF-8', 'UTF-8', :invalid => :replace)
103
+ date.to_s.gsub!(/[^\x01-\x7f]/,'')
104
+ date.to_s.strip.gsub(/\s\s+/, ' ')
105
+ end
106
+ end
107
+ private
108
+
109
+
110
+ #converts date to specified format
111
+ def convert(fuzzy_date, date_format)
112
+ if date_format.nil?
113
+ # Assuming a US date format with 3 parameters (i.e. MM?DD?YYYY)
114
+ tokens = fuzzy_date.split(/\D/)
115
+ if tokens[0].length > 2 || fuzzy_date =~ /\w{2}/
116
+ # Its ISO
117
+ return DateTime.parse(fuzzy_date.to_s).to_date
118
+ else
119
+ # Guessing US
120
+ return Date.new(tokens[2].to_i, tokens[0].to_i, tokens[1].to_i)
121
+ end
122
+ else
123
+ case date_format
124
+ when ':year_quarter'
125
+ return year_quarter_formatter(fuzzy_date)
126
+ when ':excel-1900'
127
+ return excel_1900_formatter(fuzzy_date)
128
+ else #regular ruby formatter
129
+ return regular_formatter(fuzzy_date, date_format)
130
+ end
131
+
132
+ end
133
+ end
134
+
135
+
136
+ def year_quarter_formatter(fuzzy_date)
137
+ raw_date = fuzzy_date
138
+ tokens = raw_date.gsub(/[qQ]/, '-').gsub(/[a-zA-Z]/, '').split(/[^0-9]/)
139
+ tokens.delete_if {|x| x.nil? || x.empty?} # In case there are more than one delimiter because we replaced the Q
140
+ Date.new(tokens[0].to_i, tokens[1].to_i * 3, 1)
141
+ end
142
+
143
+ def excel_1900_formatter(fuzzy_date)
144
+ # handle Lotus 123 bug has 1900 as a leap year
145
+ Date.civil(1899, 12, 31) + fuzzy_date.to_i - 1 if fuzzy_date.to_i > 0
146
+ end
147
+
148
+ def regular_formatter(fuzzy_date, date_format)
149
+ # We have a date format - oh so pretty, but...
150
+ date_string = fuzzy_date
151
+ # normalize delimiters to hyphens so we do not have to make a format for each one.
152
+ # delimiters can be letters when its all numbers and delimiters only when there are letters. Sigh.
153
+ # only if no format was provided
154
+ date_string = date_string.gsub(/[^\d\w]+/, '-') if @settings[:format].nil?
155
+
156
+ #epoch date string
157
+ if date_format == 'epoch'
158
+ news = Time.at(date_string.to_i).utc.to_s.match(/\d\d\d\d-\d\d-\d\d/)
159
+ formatted_date = DateTime.strptime(news.to_s, '%Y-%m-%d').to_date
160
+ else
161
+ if date_string.to_s =~ /^(\w{3})\D(\d{2})$/
162
+ century = $2.to_i < 25 ? '20' : '19'
163
+ date_string = "#{$1} #{century}#{$2}"
164
+ formatted_date = DateTime.strptime(date_string.to_s, '%b %Y').to_date
165
+ else
166
+ formatted_date = DateTime.strptime(date_string.to_s, date_format).to_date
167
+ end
168
+ end
169
+ formatted_date+=4 if date_format == '%GW%V' #strptime makes dates on Mondays. We want Fridays.
170
+ formatted_date
171
+ end
172
+
173
+
174
+
175
+ #find good example of date to use as template for format
176
+ #if strict == true, no ambiguity is tolerated. If strict == false, we will accept ambiguity. (02/05/2009)
177
+ def find_good_date(all_dates, strict=true)
178
+ good_sample=nil
179
+ all_dates.each do |fuzzy_date|
180
+ if usable_cell(fuzzy_date,strict)
181
+ good_sample = fuzzy_date
182
+ break
183
+ end
184
+ end
185
+ if good_sample == nil and strict==true
186
+ # We could not find a single unambiguous cell. Let's now be less strict and see if we can find something
187
+ find_good_date(all_dates,false)
188
+ else
189
+ good_sample
190
+ end
191
+ end
192
+
193
+ # if strict == true then we refuse to accept any ambiguity
194
+ # if strict == false, we'll settle for a bit of ambiguity
195
+ def usable_cell(cell,strict)
196
+ return false if cell.nil? || cell.to_s.empty?
197
+ return false if cell.to_s.size > 20 # even annotated date can't be bigger than 20
198
+
199
+ return true if cell.to_s =~ /^\w{3}\D[456789]\d$/
200
+
201
+ # date is not usable as an example if it is ambiguous as to day and month
202
+ # 03/04/2012, for example, is ambiguous. 03/17/2012 is NOT ambiguous
203
+ if re = cell.to_s.match(/^(\d{1,2})\D(\d{1,2})\D\d{2,4}/) # e.g. 03/04/2012
204
+ if re[1].to_i <= 12 and re[2].to_i <= 12
205
+ return strict==true ? false : true
206
+ else
207
+ return true
208
+ end
209
+ end
210
+
211
+ if re = cell.to_s.match(/^(\d{1,2})\D\w{3}\D(\d{2})/) # 07-jun-07
212
+ if re[1].to_i <= 12 and re[2].to_i <= 12
213
+ return false
214
+ else
215
+ return true
216
+ end
217
+ end
218
+
219
+ return true if cell.to_s =~ /\d{4}/ # It has a 4 digit year somewhere
220
+
221
+ return true if cell.to_s =~ /^\w{3}\D\d{2}/ # %b-%y(d)..also not ambiguous
222
+
223
+ false # Thank you, come again
224
+ end
225
+
226
+ # Bump date to the end of the respective periods
227
+ def frequency_transform(date, frequency)
228
+ case frequency
229
+ when 'annual'
230
+ date = Date.new(date.year,12,31)
231
+ when 'quarterly'
232
+ month = 3*((date.month-1)/3 + 1) # equals 3,6,9 or 12
233
+ date = Date.new(date.year, month, 1).next_month-1
234
+ when 'monthly'
235
+ date = Date.new(date.year, date.month,1).next_month-1
236
+ else
237
+ # Do nothing for daily or weekly
238
+ end
239
+
240
+ date
241
+ end
242
+
243
+
244
+ end
245
+ end
246
+ end
247
247
  end