quandl_babelfish 0.0.7 → 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -1,7 +1,7 @@
1
- .idea/
2
- /Gemfile.lock
3
- /pkg
4
- /tmp
5
- .rvmrc
6
- *.gem
7
- *.log
1
+ .idea/
2
+ /Gemfile.lock
3
+ /pkg
4
+ /tmp
5
+ .rvmrc
6
+ *.gem
7
+ *.log
data/.travis.yml CHANGED
@@ -1,12 +1,12 @@
1
- language: ruby
2
-
3
- rvm:
4
- - 2.0.0
5
- - 1.9.3
6
-
7
- gemfile:
8
- - Gemfile
9
-
10
- matrix:
11
-
12
- script: bundle exec rspec
1
+ language: ruby
2
+
3
+ rvm:
4
+ - 2.0.0
5
+ - 1.9.3
6
+
7
+ gemfile:
8
+ - Gemfile
9
+
10
+ matrix:
11
+
12
+ script: bundle exec rspec
data/Gemfile CHANGED
@@ -1,2 +1,2 @@
1
- source "https://rubygems.org"
1
+ source "https://rubygems.org"
2
2
  gemspec
data/LICENSE CHANGED
@@ -1,7 +1,7 @@
1
- Copyright (c) 2012-2013 Quandl
2
-
3
- Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
-
5
- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
-
7
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
1
+ Copyright (c) 2012-2013 Quandl
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md CHANGED
@@ -1,18 +1,18 @@
1
- # Quandl::Babelfish
2
-
3
- ### Installation
4
-
5
- ```ruby
6
-
7
-
8
- gem 'quandl_babelfish'
9
-
10
- ```
11
-
12
-
13
- ### Usage
14
-
15
- ```ruby
16
-
17
-
18
- ```
1
+ # Quandl::Babelfish
2
+
3
+ ### Installation
4
+
5
+ ```ruby
6
+
7
+
8
+ gem 'quandl_babelfish'
9
+
10
+ ```
11
+
12
+
13
+ ### Usage
14
+
15
+ ```ruby
16
+
17
+
18
+ ```
data/UPGRADE.md CHANGED
@@ -1,32 +1,39 @@
1
- ## 0.0.7
2
- * added squaring final data
3
-
4
- ## 0.0.6
5
-
6
- * add Babelfish::Chronometer
7
- * add Babelfish.guess_frequency
8
-
9
- ## 0.0.5
10
-
11
- * improve error messages with line, row, context
12
- * add Quandl::Error::Standard, all errors inherit from Error::Standard
13
-
14
-
15
- ## 0.0.4
16
-
17
- * remove quandl_data as a dependency
18
-
19
-
20
- ## 0.0.3
21
-
22
- * Add Quandl::Data as a add_runtime_dependency
23
- * refactor Babelfish::Data to inherit from Quandl::Data
24
- * refactor specs
25
-
26
-
27
- ## 0.0.1
28
-
29
- * replace Cleaner.process return clean_array, header with Quandl::Babelfish::Data.new( clean_array, headers: header )
30
- * refactored error
31
- * added header extraction support
1
+ ## 0.0.8
2
+
3
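+ * QUGC-42 you should not have to include data; you might just want to update headers (header-only input is now accepted: Cleaner.process returns an empty data array together with the headers)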
4
+
5
+
6
+ ## 0.0.7
7
+
8
+ * added squaring final data
9
+
10
+
11
+ ## 0.0.6
12
+
13
+ * add Babelfish::Chronometer
14
+ * add Babelfish.guess_frequency
15
+
16
+ ## 0.0.5
17
+
18
+ * improve error messages with line, row, context
19
+ * add Quandl::Error::Standard, all errors inherit from Error::Standard
20
+
21
+
22
+ ## 0.0.4
23
+
24
+ * remove quandl_data as a dependency
25
+
26
+
27
+ ## 0.0.3
28
+
29
+ * Add Quandl::Data as a add_runtime_dependency
30
+ * refactor Babelfish::Data to inherit from Quandl::Data
31
+ * refactor specs
32
+
33
+
34
+ ## 0.0.1
35
+
36
+ * replace Cleaner.process return clean_array, header with Quandl::Babelfish::Data.new( clean_array, headers: header )
37
+ * refactored error
38
+ * added header extraction support
32
39
  * init
@@ -1,29 +1,29 @@
1
- require "quandl/babelfish/version"
2
-
3
- require "quandl/babelfish/helper"
4
- require "quandl/babelfish/cleaner"
5
- require "quandl/babelfish/date_maid"
6
- require "quandl/babelfish/number_maid"
7
- require "quandl/babelfish/chronometer"
8
-
9
- require 'quandl/error/standard'
10
- require 'quandl/error/guess_date_format'
11
- require 'quandl/error/invalid_date'
12
- require 'quandl/error/unknown_date_format'
13
-
14
- module Quandl
15
- module Babelfish
16
-
17
- class << self
18
- def clean(data, date_settings={}, number_settings={})
19
- Cleaner::process data, date_settings, number_settings
20
- end
21
-
22
- def guess_frequency(data)
23
- Chronometer::process data
24
- end
25
-
26
- end
27
-
28
- end
1
+ require "quandl/babelfish/version"
2
+
3
+ require "quandl/babelfish/helper"
4
+ require "quandl/babelfish/cleaner"
5
+ require "quandl/babelfish/date_maid"
6
+ require "quandl/babelfish/number_maid"
7
+ require "quandl/babelfish/chronometer"
8
+
9
+ require 'quandl/error/standard'
10
+ require 'quandl/error/guess_date_format'
11
+ require 'quandl/error/invalid_date'
12
+ require 'quandl/error/unknown_date_format'
13
+
14
+ module Quandl
15
+ module Babelfish
16
+
17
+ class << self
18
+ def clean(data, date_settings={}, number_settings={})
19
+ Cleaner::process data, date_settings, number_settings
20
+ end
21
+
22
+ def guess_frequency(data)
23
+ Chronometer::process data
24
+ end
25
+
26
+ end
27
+
28
+ end
29
29
  end
@@ -1,44 +1,44 @@
1
- module Quandl
2
- module Babelfish
3
-
4
- class Chronometer
5
- class << self
6
-
7
- #return frequency and warning message if present
8
- def process(table)
9
- # guesses date frequency in a table
10
- return nil if table.nil? || table.size==0
11
- return 'daily' if table.size==1 #not enough , need more points
12
- freqs = []
13
- fmt = "%Y-%m"
14
- fmt = "%Y" if table[0][0].to_s !~ /-/
15
- fmt = "%Y-%m-%d" if table[0][0].to_s =~ /^.*-.*-.*$/
16
-
17
- table.each_index do |r|
18
- break if r==6 #first 6 record is enough to analyze
19
- if table[r+1].nil?
20
- break
21
- else
22
- diff = (Date.strptime(table[r+1][0].to_s, fmt) -
23
- Date.strptime(table[r][0].to_s, fmt)).to_i.abs
24
- if diff < 4
25
- freqs << 'daily'
26
- elsif diff < 10
27
- freqs << 'weekly'
28
- elsif diff < 60
29
- freqs << 'monthly'
30
- elsif diff < 200
31
- freqs << 'quarterly'
32
- else
33
- freqs << 'annual'
34
- end
35
- end
36
- end
37
- return freqs.sort_by { |e| freqs.count(e) }.reverse.first#, nil
38
- end
39
-
40
- end
41
- end
42
-
43
- end
1
+ module Quandl
2
+ module Babelfish
3
+
4
+ class Chronometer
5
+ class << self
6
+
7
+ #return the guessed frequency (nil for an empty table); a warning message is not currently returned
8
+ def process(table)
9
+ # guesses date frequency in a table
10
+ return nil if table.nil? || table.size==0
11
+ return 'daily' if table.size==1 #not enough , need more points
12
+ freqs = []
13
+ fmt = "%Y-%m"
14
+ fmt = "%Y" if table[0][0].to_s !~ /-/
15
+ fmt = "%Y-%m-%d" if table[0][0].to_s =~ /^.*-.*-.*$/
16
+
17
+ table.each_index do |r|
18
+ break if r==6 #first 6 record is enough to analyze
19
+ if table[r+1].nil?
20
+ break
21
+ else
22
+ diff = (Date.strptime(table[r+1][0].to_s, fmt) -
23
+ Date.strptime(table[r][0].to_s, fmt)).to_i.abs
24
+ if diff < 4
25
+ freqs << 'daily'
26
+ elsif diff < 10
27
+ freqs << 'weekly'
28
+ elsif diff < 60
29
+ freqs << 'monthly'
30
+ elsif diff < 200
31
+ freqs << 'quarterly'
32
+ else
33
+ freqs << 'annual'
34
+ end
35
+ end
36
+ end
37
+ return freqs.sort_by { |e| freqs.count(e) }.reverse.first#, nil
38
+ end
39
+
40
+ end
41
+ end
42
+
43
+ end
44
44
  end
@@ -1,33 +1,34 @@
1
- module Quandl
2
- module Babelfish
3
-
4
- class Cleaner
5
- class << self
6
- def process(dirty_array, date_settings={}, number_settings={})
7
- return nil,nil if dirty_array.nil?
8
-
9
- #check if first line is header
10
- header=DateMaid::disinfect(dirty_array.shift) unless DateMaid::analyze_date_format(DateMaid::disinfect(dirty_array[0][0])[0])[0]
11
-
12
- #converts dates first
13
- dirty_array
14
- dates = dirty_array.collect{|x| x[0]}
15
- DateMaid::init(date_settings)
16
- clean_dates=DateMaid::sweep dates
17
-
18
- clean_array=[]
19
- #clean numbers later
20
- NumberMaid::init(number_settings)
21
- dirty_array.each.with_index do |row, i|
22
- new_row=[]
23
- (new_row << clean_dates[i]).concat Array(NumberMaid::clean(row[1..-1])) #add clean date and all clean numbers
24
- clean_array << new_row
25
- end
26
-
27
- return Helper::make_square(clean_array), header
28
- end
29
- end
30
- end
31
-
32
- end
1
+ module Quandl
2
+ module Babelfish
3
+
4
+ class Cleaner
5
+ class << self
6
+ def process(dirty_array, date_settings={}, number_settings={})
7
+ return nil,nil if dirty_array.nil?
8
+
9
+ #check if first line is header
10
+ header=DateMaid::disinfect(dirty_array.shift) unless DateMaid::analyze_date_format(DateMaid::disinfect(dirty_array[0][0])[0])[0]
11
+ return [], header if dirty_array.empty?
12
+
13
+ #converts dates first
14
+ dirty_array
15
+ dates = dirty_array.collect{|x| x[0]}
16
+ DateMaid::init(date_settings)
17
+ clean_dates=DateMaid::sweep dates
18
+
19
+ clean_array=[]
20
+ #clean numbers later
21
+ NumberMaid::init(number_settings)
22
+ dirty_array.each.with_index do |row, i|
23
+ new_row=[]
24
+ (new_row << clean_dates[i]).concat Array(NumberMaid::clean(row[1..-1])) #add clean date and all clean numbers
25
+ clean_array << new_row
26
+ end
27
+
28
+ return Helper::make_square(clean_array), header
29
+ end
30
+ end
31
+ end
32
+
33
+ end
33
34
  end
@@ -1,238 +1,238 @@
1
- module Quandl
2
- module Babelfish
3
-
4
- #responsible for number formatting
5
- class DateMaid
6
- @defaults = {
7
- :format => nil
8
- }
9
-
10
- @settings = @defaults #init with defaults
11
-
12
- class << self
13
-
14
- def init(user_settings)
15
- @settings=@defaults.merge(user_settings)
16
- end
17
-
18
- #looks at all the dates and formats them to unambiguous ISO 8601 format (yyyy-mm-dd)
19
- def sweep(all_dates)
20
- return nil if all_dates.nil?
21
-
22
- all_dates = disinfect all_dates
23
-
24
- if @settings[:format].nil?
25
- #find good example and extract all info from it and apply it to each of the dates in the set
26
- good_sample = find_good_date(all_dates)
27
-
28
- raise( Error::GuessDateFormat.new, "Unable to find date format for provided dates" ) if good_sample.nil?
29
-
30
- date_format, frequency = analyze_date_format(good_sample)
31
-
32
-
33
- else
34
- date_format = @settings[:format]
35
- end
36
-
37
- iso_dates=[]
38
- all_dates.each_with_index do |fuzzy_date, i|
39
- temp_date = convert(fuzzy_date, date_format) rescue raise( Error::InvalidDate.new( line: i+1, row: fuzzy_date, context: 'convert' ), "Invalid date '#{fuzzy_date}'" )
40
- iso_dates << frequency_transform(temp_date, frequency)
41
- end
42
-
43
- iso_dates
44
- end
45
-
46
- def analyze_date_format(example)
47
- return nil if example.nil?
48
-
49
- # Regular formats and Custom formats (where Date.parse and Date.strptime
50
- # fear to tread)
51
- if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{4}/) # eg "07/03/2012"
52
- if re[1].to_i > 12
53
- return '%d-%m-%Y', nil
54
- else
55
- return '%m-%d-%Y', nil
56
- end
57
- end
58
- if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{2}/) # eg "07/03/12"
59
- if re[1].to_i > 12
60
- return '%d-%m-%y', nil
61
- else
62
- return '%m-%d-%y', nil
63
- end
64
- end
65
- # order these guys from most specific to most general
66
- return "%Y", "annual" if example =~ /^\d{4}[\s]?-[\s]?\d{4}$/
67
- return '%Y%m%d', 'daily' if example =~ /^\d{8}$/ && example[4..5].to_i < 13 && example[6..7].to_i < 32 # precisely 8 digits - yyyymmdd
68
- return 'epoch', 'daily' if example =~ /^\d{7}.*$/ # 7 or more digits - epoch
69
- return '%Y', 'annual' if example =~ /^\d{4}$/ # 4 digits
70
- return '%Y', 'annual' if example =~ /^\d{4}\.0$/ # 4 digits with a dot 0 for excel
71
- return ':year_quarter', 'quarterly' if example =~ /^\d{4}[Qq]\d$/ # 4 digits, Q, digit (here because the next pattern would override it)
72
- return '%YM%m', 'monthly' if example =~ /^\d{4}M\d{1,2}$/ # 2007M08
73
- return '%GW%V', 'weekly' if example =~ /^\d{4}W\d{1,2}$/ # 2012W01
74
- return '%Y-%m', 'monthly' if example =~ /^\d{4}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits
75
- return '%m-%Y', 'monthly' if example =~ /^\d{1,2}\D\d{4}$/ # 1-2 digits, separator, 4 digits
76
- return '%Y%m', 'monthly' if example =~ /^\d{6}$/ # 6 digits
77
- return '%Y-%b', 'monthly' if example =~ /^\d{4}\D\w{3}$/ # 4 digits, separator, 3 letters
78
- return '%b-%Y', 'monthly' if example =~ /^\w{3}\D\d{4}$/ # 3 letters, separator, 4 digits
79
- return '%b-%y', 'monthly' if example =~ /^\w{3}\D\d{2}$/ # 3 letters, separator, 2 digits
80
- return '%Y%b', 'monthly' if example =~ /^\d{4}\w{3}$/ # 4 digits, 3 letters
81
- return '%b%Y', 'monthly' if example =~ /^\w{3}\d{4}$/ # 3 letters, 4 digits
82
- return '%Y-%b-%d', 'daily' if example =~ /^\d{4}\D\w{3}\D\d{1,2}$/ # 4 digits, separator, 3 letters, separator, 1-2 digits
83
- return '%Y-%m-%d', 'daily' if example =~ /^\d{4}\D\d{1,2}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits, separator, 1-2 digits
84
- return '%d-%b-%Y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{4}$/ # 1-2 digits, separator, 3 letters, separator, 4 digits
85
- return '%Y%b%d', 'daily' if example =~ /^\d{4}\w{3}\d{1,2}$/ # 4 digits, 3 letters, 1-2 digits
86
- return '%d%b%Y', 'daily' if example =~ /^\d{1,2}\w{3}\d{4}$/ # 1-2 digits, 3 letters, 4 digits
87
- return '%d-%b-%y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{2}$/ # 1-2 digits, 3 letters, 2 digits
88
- return '%b-%d-%Y', 'daily' if example =~ /^\w{3}\D\d{1,2}\D{1,2}\d{4}$/ # 3 letters, separator, 1-2 digits, separator(s), 4 digits
89
-
90
- #our custom formats
91
- return ':year_quarter', 'quarterly' if example =~ /^\d{4}\D[Qq]\d$/ # 4 digits, separator, Q, digit
92
- return ':excel-1900', 'daily' if example =~ /^\d{5}$/ # 5 digits
93
- return ':excel-1900', 'daily' if example =~ /^\d{5}\.0$/ # 5 digits dot zero excel
94
-
95
- # No, try default date parse
96
- # raise PostProcessorException, "Unable to guess date format for #{example}"
97
- [nil, nil]
98
- end
99
-
100
- def disinfect(dates)
101
- [*dates].collect do |date|
102
- date.to_s.encode!('UTF-8', 'UTF-8', :invalid => :replace)
103
- date.to_s.gsub!(/[^\x01-\x7f]/,'')
104
- date.to_s.strip.gsub(/\s\s+/, ' ')
105
- end
106
- end
107
- private
108
-
109
-
110
- #converts date to specified format
111
- def convert(fuzzy_date, date_format)
112
- if date_format.nil?
113
- # Assuming a US date format with 3 parameters (i.e. MM?DD?YYYY)
114
- tokens = fuzzy_date.split(/\D/)
115
- if tokens[0].length > 2 || fuzzy_date =~ /\w{2}/
116
- # Its ISO
117
- return DateTime.parse(fuzzy_date.to_s).to_date
118
- else
119
- # Guessing US
120
- return Date.new(tokens[2].to_i, tokens[0].to_i, tokens[1].to_i)
121
- end
122
- else
123
- case date_format
124
- when ':year_quarter'
125
- return year_quarter_formatter(fuzzy_date)
126
- when ':excel-1900'
127
- return excel_1900_formatter(fuzzy_date)
128
- else #regular ruby formatter
129
- return regular_formatter(fuzzy_date, date_format)
130
- end
131
-
132
- end
133
- end
134
-
135
-
136
- def year_quarter_formatter(fuzzy_date)
137
- raw_date = fuzzy_date
138
- tokens = raw_date.gsub(/[qQ]/, '-').gsub(/[a-zA-Z]/, '').split(/[^0-9]/)
139
- tokens.delete_if {|x| x.nil? || x.empty?} # In case there are more than one delimiter because we replaced the Q
140
- Date.new(tokens[0].to_i, tokens[1].to_i * 3, 1)
141
- end
142
-
143
- def excel_1900_formatter(fuzzy_date)
144
- # handle Lotus 123 bug has 1900 as a leap year
145
- Date.civil(1899, 12, 31) + fuzzy_date.to_i - 1 if fuzzy_date.to_i > 0
146
- end
147
-
148
- def regular_formatter(fuzzy_date, date_format)
149
- # We have a date format - oh so pretty, but...
150
- date_string = fuzzy_date
151
- # normalize delimiters to hyphens so we do not have to make a format for each one.
152
- # delimiters can be letters when its all numbers and delimiters only when there are letters. Sigh.
153
- # only if no format where provided
154
- date_string = date_string.gsub(/[^\d\w]+/, '-') if @settings[:format].nil?
155
-
156
- #epoch date string
157
- if date_format == 'epoch'
158
- news = Time.at(date_string.to_i).utc.to_s.match(/\d\d\d\d-\d\d-\d\d/)
159
- formatted_date = DateTime.strptime(news.to_s, '%Y-%m-%d').to_date
160
- else
161
- if date_string.to_s =~ /^(\w{3})\D(\d{2})$/
162
- century = $2.to_i < 25 ? '20' : '19'
163
- date_string = "#{$1} #{century}#{$2}"
164
- formatted_date = DateTime.strptime(date_string.to_s, '%b %Y').to_date
165
- else
166
- formatted_date = DateTime.strptime(date_string.to_s, date_format).to_date
167
- end
168
- end
169
- formatted_date+=4 if date_format == '%GW%V' #strptime makes dates on Mondays. We want Fridays.
170
- formatted_date
171
- end
172
-
173
-
174
-
175
- #find good example of date to use as template for format
176
- def find_good_date(all_dates)
177
- good_sample=nil
178
- all_dates.each do |fuzzy_date|
179
- if usable_cell(fuzzy_date)
180
- good_sample = fuzzy_date
181
- break
182
- end
183
- end
184
- good_sample
185
- end
186
-
187
- def usable_cell(cell)
188
- return false if cell.nil? || cell.to_s.empty?
189
- return false if cell.to_s.size > 20 # even annotated date can't be bigger than 20
190
-
191
- return true if cell.to_s =~ /^\w{3}\D[456789]\d$/
192
- # date is not usable as an example if it is ambiguous as to day and month
193
- # 03/04/2012, for example, is ambiguous. 03/17/2012 is NOT ambiguous
194
- if re = cell.to_s.match(/^(\d{1,2})\D(\d{1,2})\D\d{2,4}/) # e.g. 03/04/2012
195
- if re[1].to_i <= 12 and re[2].to_i <= 12
196
- return false
197
- else
198
- return true
199
- end
200
- end
201
-
202
- if re = cell.to_s.match(/^(\d{1,2})\D\w{3}\D(\d{2})/) # 07-jun-07
203
- if re[1].to_i <= 12 and re[2].to_i <= 12
204
- return false
205
- else
206
- return true
207
- end
208
- end
209
-
210
- return true if cell.to_s =~ /\d{4}/ # It has a 4 digit year somewhere
211
-
212
- return true if cell.to_s =~ /^\w{3}\D\d{2}/ # %b-%y(d)..also not ambiguous
213
-
214
- false # Thank you, come again
215
- end
216
-
217
- # Bump date to the end of the respective periods
218
- def frequency_transform(date, frequency)
219
- case frequency
220
- when 'annual'
221
- date = Date.new(date.year,12,31)
222
- when 'quarterly'
223
- month = 3*((date.month-1)/3 + 1) # equals 3,6,9 or 12
224
- date = Date.new(date.year, month, 1).next_month-1
225
- when 'monthly'
226
- date = Date.new(date.year, date.month,1).next_month-1
227
- else
228
- # Do nothing for daily or weekly
229
- end
230
-
231
- date
232
- end
233
-
234
-
235
- end
236
- end
237
- end
1
+ module Quandl
2
+ module Babelfish
3
+
4
+ #responsible for date formatting
5
+ class DateMaid
6
+ @defaults = {
7
+ :format => nil
8
+ }
9
+
10
+ @settings = @defaults #init with defaults
11
+
12
+ class << self
13
+
14
+ def init(user_settings)
15
+ @settings=@defaults.merge(user_settings)
16
+ end
17
+
18
+ #looks at all the dates and formats them to unambiguous ISO 8601 format (yyyy-mm-dd)
19
+ def sweep(all_dates)
20
+ return nil if all_dates.nil?
21
+
22
+ all_dates = disinfect all_dates
23
+
24
+ if @settings[:format].nil?
25
+ #find good example and extract all info from it and apply it to each of the dates in the set
26
+ good_sample = find_good_date(all_dates)
27
+
28
+ raise( Error::GuessDateFormat.new, "Unable to find date format for provided dates" ) if good_sample.nil?
29
+
30
+ date_format, frequency = analyze_date_format(good_sample)
31
+
32
+
33
+ else
34
+ date_format = @settings[:format]
35
+ end
36
+
37
+ iso_dates=[]
38
+ all_dates.each_with_index do |fuzzy_date, i|
39
+ temp_date = convert(fuzzy_date, date_format) rescue raise( Error::InvalidDate.new( line: i+1, row: fuzzy_date, context: 'convert' ), "Invalid date '#{fuzzy_date}'" )
40
+ iso_dates << frequency_transform(temp_date, frequency)
41
+ end
42
+
43
+ iso_dates
44
+ end
45
+
46
+ def analyze_date_format(example)
47
+ return nil if example.nil?
48
+
49
+ # Regular formats and Custom formats (where Date.parse and Date.strptime
50
+ # fear to tread)
51
+ if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{4}/) # eg "07/03/2012"
52
+ if re[1].to_i > 12
53
+ return '%d-%m-%Y', nil
54
+ else
55
+ return '%m-%d-%Y', nil
56
+ end
57
+ end
58
+ if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{2}/) # eg "07/03/12"
59
+ if re[1].to_i > 12
60
+ return '%d-%m-%y', nil
61
+ else
62
+ return '%m-%d-%y', nil
63
+ end
64
+ end
65
+ # order these guys from most specific to most general
66
+ return "%Y", "annual" if example =~ /^\d{4}[\s]?-[\s]?\d{4}$/
67
+ return '%Y%m%d', 'daily' if example =~ /^\d{8}$/ && example[4..5].to_i < 13 && example[6..7].to_i < 32 # precisely 8 digits - yyyymmdd
68
+ return 'epoch', 'daily' if example =~ /^\d{7}.*$/ # 7 or more digits - epoch
69
+ return '%Y', 'annual' if example =~ /^\d{4}$/ # 4 digits
70
+ return '%Y', 'annual' if example =~ /^\d{4}\.0$/ # 4 digits with a dot 0 for excel
71
+ return ':year_quarter', 'quarterly' if example =~ /^\d{4}[Qq]\d$/ # 4 digits, Q, digit (here because the next pattern would override it)
72
+ return '%YM%m', 'monthly' if example =~ /^\d{4}M\d{1,2}$/ # 2007M08
73
+ return '%GW%V', 'weekly' if example =~ /^\d{4}W\d{1,2}$/ # 2012W01
74
+ return '%Y-%m', 'monthly' if example =~ /^\d{4}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits
75
+ return '%m-%Y', 'monthly' if example =~ /^\d{1,2}\D\d{4}$/ # 1-2 digits, separator, 4 digits
76
+ return '%Y%m', 'monthly' if example =~ /^\d{6}$/ # 6 digits
77
+ return '%Y-%b', 'monthly' if example =~ /^\d{4}\D\w{3}$/ # 4 digits, separator, 3 letters
78
+ return '%b-%Y', 'monthly' if example =~ /^\w{3}\D\d{4}$/ # 3 letters, separator, 4 digits
79
+ return '%b-%y', 'monthly' if example =~ /^\w{3}\D\d{2}$/ # 3 letters, separator, 2 digits
80
+ return '%Y%b', 'monthly' if example =~ /^\d{4}\w{3}$/ # 4 digits, 3 letters
81
+ return '%b%Y', 'monthly' if example =~ /^\w{3}\d{4}$/ # 3 letters, 4 digits
82
+ return '%Y-%b-%d', 'daily' if example =~ /^\d{4}\D\w{3}\D\d{1,2}$/ # 4 digits, separator, 3 letters, separator, 1-2 digits
83
+ return '%Y-%m-%d', 'daily' if example =~ /^\d{4}\D\d{1,2}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits, separator, 1-2 digits
84
+ return '%d-%b-%Y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{4}$/ # 1-2 digits, separator, 3 letters, separator, 4 digits
85
+ return '%Y%b%d', 'daily' if example =~ /^\d{4}\w{3}\d{1,2}$/ # 4 digits, 3 letters, 1-2 digits
86
+ return '%d%b%Y', 'daily' if example =~ /^\d{1,2}\w{3}\d{4}$/ # 1-2 digits, 3 letters, 4 digits
87
+ return '%d-%b-%y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{2}$/ # 1-2 digits, 3 letters, 2 digits
88
+ return '%b-%d-%Y', 'daily' if example =~ /^\w{3}\D\d{1,2}\D{1,2}\d{4}$/ # 3 letters, separator, 1-2 digits, separator(s), 4 digits
89
+
90
+ #our custom formats
91
+ return ':year_quarter', 'quarterly' if example =~ /^\d{4}\D[Qq]\d$/ # 4 digits, separator, Q, digit
92
+ return ':excel-1900', 'daily' if example =~ /^\d{5}$/ # 5 digits
93
+ return ':excel-1900', 'daily' if example =~ /^\d{5}\.0$/ # 5 digits dot zero excel
94
+
95
+ # No, try default date parse
96
+ # raise PostProcessorException, "Unable to guess date format for #{example}"
97
+ [nil, nil]
98
+ end
99
+
100
+ def disinfect(dates)
101
+ [*dates].collect do |date|
102
+ date.to_s.encode!('UTF-8', 'UTF-8', :invalid => :replace)
103
+ date.to_s.gsub!(/[^\x01-\x7f]/,'')
104
+ date.to_s.strip.gsub(/\s\s+/, ' ')
105
+ end
106
+ end
107
+ private
108
+
109
+
110
+ #converts date to specified format
111
+ def convert(fuzzy_date, date_format)
112
+ if date_format.nil?
113
+ # Assuming a US date format with 3 parameters (i.e. MM?DD?YYYY)
114
+ tokens = fuzzy_date.split(/\D/)
115
+ if tokens[0].length > 2 || fuzzy_date =~ /\w{2}/
116
+ # Its ISO
117
+ return DateTime.parse(fuzzy_date.to_s).to_date
118
+ else
119
+ # Guessing US
120
+ return Date.new(tokens[2].to_i, tokens[0].to_i, tokens[1].to_i)
121
+ end
122
+ else
123
+ case date_format
124
+ when ':year_quarter'
125
+ return year_quarter_formatter(fuzzy_date)
126
+ when ':excel-1900'
127
+ return excel_1900_formatter(fuzzy_date)
128
+ else #regular ruby formatter
129
+ return regular_formatter(fuzzy_date, date_format)
130
+ end
131
+
132
+ end
133
+ end
134
+
135
+
136
+ def year_quarter_formatter(fuzzy_date)
137
+ raw_date = fuzzy_date
138
+ tokens = raw_date.gsub(/[qQ]/, '-').gsub(/[a-zA-Z]/, '').split(/[^0-9]/)
139
+ tokens.delete_if {|x| x.nil? || x.empty?} # In case there are more than one delimiter because we replaced the Q
140
+ Date.new(tokens[0].to_i, tokens[1].to_i * 3, 1)
141
+ end
142
+
143
+ def excel_1900_formatter(fuzzy_date)
144
+ # handle Lotus 123 bug has 1900 as a leap year
145
+ Date.civil(1899, 12, 31) + fuzzy_date.to_i - 1 if fuzzy_date.to_i > 0
146
+ end
147
+
148
+ def regular_formatter(fuzzy_date, date_format)
149
+ # We have a date format - oh so pretty, but...
150
+ date_string = fuzzy_date
151
+ # normalize delimiters to hyphens so we do not have to make a format for each one.
152
+ # delimiters can be letters when its all numbers and delimiters only when there are letters. Sigh.
153
+ # only if no format where provided
154
+ date_string = date_string.gsub(/[^\d\w]+/, '-') if @settings[:format].nil?
155
+
156
+ #epoch date string
157
+ if date_format == 'epoch'
158
+ news = Time.at(date_string.to_i).utc.to_s.match(/\d\d\d\d-\d\d-\d\d/)
159
+ formatted_date = DateTime.strptime(news.to_s, '%Y-%m-%d').to_date
160
+ else
161
+ if date_string.to_s =~ /^(\w{3})\D(\d{2})$/
162
+ century = $2.to_i < 25 ? '20' : '19'
163
+ date_string = "#{$1} #{century}#{$2}"
164
+ formatted_date = DateTime.strptime(date_string.to_s, '%b %Y').to_date
165
+ else
166
+ formatted_date = DateTime.strptime(date_string.to_s, date_format).to_date
167
+ end
168
+ end
169
+ formatted_date+=4 if date_format == '%GW%V' #strptime makes dates on Mondays. We want Fridays.
170
+ formatted_date
171
+ end
172
+
173
+
174
+
175
+ #find good example of date to use as template for format
176
+ def find_good_date(all_dates)
177
+ good_sample=nil
178
+ all_dates.each do |fuzzy_date|
179
+ if usable_cell(fuzzy_date)
180
+ good_sample = fuzzy_date
181
+ break
182
+ end
183
+ end
184
+ good_sample
185
+ end
186
+
187
+ def usable_cell(cell)
188
+ return false if cell.nil? || cell.to_s.empty?
189
+ return false if cell.to_s.size > 20 # even annotated date can't be bigger than 20
190
+
191
+ return true if cell.to_s =~ /^\w{3}\D[456789]\d$/
192
+ # date is not usable as an example if it is ambiguous as to day and month
193
+ # 03/04/2012, for example, is ambiguous. 03/17/2012 is NOT ambiguous
194
+ if re = cell.to_s.match(/^(\d{1,2})\D(\d{1,2})\D\d{2,4}/) # e.g. 03/04/2012
195
+ if re[1].to_i <= 12 and re[2].to_i <= 12
196
+ return false
197
+ else
198
+ return true
199
+ end
200
+ end
201
+
202
+ if re = cell.to_s.match(/^(\d{1,2})\D\w{3}\D(\d{2})/) # 07-jun-07
203
+ if re[1].to_i <= 12 and re[2].to_i <= 12
204
+ return false
205
+ else
206
+ return true
207
+ end
208
+ end
209
+
210
+ return true if cell.to_s =~ /\d{4}/ # It has a 4 digit year somewhere
211
+
212
+ return true if cell.to_s =~ /^\w{3}\D\d{2}/ # %b-%y(d)..also not ambiguous
213
+
214
+ false # Thank you, come again
215
+ end
216
+
217
+ # Bump date to the end of the respective periods
218
+ def frequency_transform(date, frequency)
219
+ case frequency
220
+ when 'annual'
221
+ date = Date.new(date.year,12,31)
222
+ when 'quarterly'
223
+ month = 3*((date.month-1)/3 + 1) # equals 3,6,9 or 12
224
+ date = Date.new(date.year, month, 1).next_month-1
225
+ when 'monthly'
226
+ date = Date.new(date.year, date.month,1).next_month-1
227
+ else
228
+ # Do nothing for daily or weekly
229
+ end
230
+
231
+ date
232
+ end
233
+
234
+
235
+ end
236
+ end
237
+ end
238
238
  end