quandl_babelfish 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +7 -7
- data/.travis.yml +12 -12
- data/Gemfile +1 -1
- data/LICENSE +7 -7
- data/README.md +18 -18
- data/UPGRADE.md +31 -29
- data/lib/quandl/babelfish/chronometer.rb +43 -43
- data/lib/quandl/babelfish/cleaner.rb +32 -32
- data/lib/quandl/babelfish/date_maid.rb +237 -237
- data/lib/quandl/babelfish/helper.rb +9 -0
- data/lib/quandl/babelfish/number_maid.rb +79 -79
- data/lib/quandl/babelfish/version.rb +4 -4
- data/lib/quandl/babelfish.rb +28 -27
- data/lib/quandl/error/guess_date_format.rb +4 -4
- data/lib/quandl/error/invalid_date.rb +4 -4
- data/lib/quandl/error/standard.rb +26 -26
- data/lib/quandl/error/unknown_date_format.rb +4 -4
- data/quandl_babelfish.gemspec +21 -21
- data/spec/lib/quandl/babelfish/chronometer_spec.rb +50 -50
- data/spec/lib/quandl/babelfish/cleaner_spec.rb +70 -49
- data/spec/lib/quandl/babelfish/date_maid_spec.rb +528 -528
- data/spec/lib/quandl/babelfish/helper_spec.rb +45 -0
- data/spec/lib/quandl/babelfish/number_maid_spec.rb +126 -126
- data/spec/lib/quandl/babelfish_spec.rb +15 -15
- data/spec/spec_helper.rb +12 -12
- data/spec/support/matchers/be_eq_at_index.rb +31 -31
- metadata +18 -17
- checksums.yaml +0 -7
@@ -1,238 +1,238 @@
|
|
1
|
-
module Quandl
|
2
|
-
module Babelfish
|
3
|
-
|
4
|
-
#responsible for number formatting
|
5
|
-
class DateMaid
|
6
|
-
@defaults = {
|
7
|
-
:format => nil
|
8
|
-
}
|
9
|
-
|
10
|
-
@settings = @defaults #init with defaults
|
11
|
-
|
12
|
-
class << self
|
13
|
-
|
14
|
-
def init(user_settings)
|
15
|
-
@settings=@defaults.merge(user_settings)
|
16
|
-
end
|
17
|
-
|
18
|
-
#looks at all the dates and formats them to unambiguous ISO 8601 format (yyyy-mm-dd)
|
19
|
-
def sweep(all_dates)
|
20
|
-
return nil if all_dates.nil?
|
21
|
-
|
22
|
-
all_dates = disinfect all_dates
|
23
|
-
|
24
|
-
if @settings[:format].nil?
|
25
|
-
#find good example and extract all info from it and apply it to each of the dates in the set
|
26
|
-
good_sample = find_good_date(all_dates)
|
27
|
-
|
28
|
-
raise( Error::GuessDateFormat.new, "Unable to find date format for provided dates" ) if good_sample.nil?
|
29
|
-
|
30
|
-
date_format, frequency = analyze_date_format(good_sample)
|
31
|
-
|
32
|
-
|
33
|
-
else
|
34
|
-
date_format = @settings[:format]
|
35
|
-
end
|
36
|
-
|
37
|
-
iso_dates=[]
|
38
|
-
all_dates.each_with_index do |fuzzy_date, i|
|
39
|
-
temp_date = convert(fuzzy_date, date_format) rescue raise( Error::InvalidDate.new( line: i+1, row: fuzzy_date, context: 'convert' ), "Invalid date '#{fuzzy_date}'" )
|
40
|
-
iso_dates << frequency_transform(temp_date, frequency)
|
41
|
-
end
|
42
|
-
|
43
|
-
iso_dates
|
44
|
-
end
|
45
|
-
|
46
|
-
def analyze_date_format(example)
|
47
|
-
return nil if example.nil?
|
48
|
-
|
49
|
-
# Regular formats and Custom formats (where Date.parse and Date.strptime
|
50
|
-
# fear to tread)
|
51
|
-
if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{4}/) # eg "07/03/2012"
|
52
|
-
if re[1].to_i > 12
|
53
|
-
return '%d-%m-%Y', nil
|
54
|
-
else
|
55
|
-
return '%m-%d-%Y', nil
|
56
|
-
end
|
57
|
-
end
|
58
|
-
if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{2}/) # eg "07/03/12"
|
59
|
-
if re[1].to_i > 12
|
60
|
-
return '%d-%m-%y', nil
|
61
|
-
else
|
62
|
-
return '%m-%d-%y', nil
|
63
|
-
end
|
64
|
-
end
|
65
|
-
# order these guys from most specific to most general
|
66
|
-
return "%Y", "annual" if example =~ /^\d{4}[\s]?-[\s]?\d{4}$/
|
67
|
-
return '%Y%m%d', 'daily' if example =~ /^\d{8}$/ && example[4..5].to_i < 13 && example[6..7].to_i < 32 # precisely 8 digits - yyyymmdd
|
68
|
-
return 'epoch', 'daily' if example =~ /^\d{7}.*$/ # 7 or more digits - epoch
|
69
|
-
return '%Y', 'annual' if example =~ /^\d{4}$/ # 4 digits
|
70
|
-
return '%Y', 'annual' if example =~ /^\d{4}\.0$/ # 4 digits with a dot 0 for excel
|
71
|
-
return ':year_quarter', 'quarterly' if example =~ /^\d{4}[Qq]\d$/ # 4 digits, Q, digit (here because the next pattern would override it)
|
72
|
-
return '%YM%m', 'monthly' if example =~ /^\d{4}M\d{1,2}$/ # 2007M08
|
73
|
-
return '%GW%V', 'weekly' if example =~ /^\d{4}W\d{1,2}$/ # 2012W01
|
74
|
-
return '%Y-%m', 'monthly' if example =~ /^\d{4}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits
|
75
|
-
return '%m-%Y', 'monthly' if example =~ /^\d{1,2}\D\d{4}$/ # 1-2 digits, separator, 4 digits
|
76
|
-
return '%Y%m', 'monthly' if example =~ /^\d{6}$/ # 6 digits
|
77
|
-
return '%Y-%b', 'monthly' if example =~ /^\d{4}\D\w{3}$/ # 4 digits, separator, 3 letters
|
78
|
-
return '%b-%Y', 'monthly' if example =~ /^\w{3}\D\d{4}$/ # 3 letters, separator, 4 digits
|
79
|
-
return '%b-%y', 'monthly' if example =~ /^\w{3}\D\d{2}$/ # 3 letters, separator, 2 digits
|
80
|
-
return '%Y%b', 'monthly' if example =~ /^\d{4}\w{3}$/ # 4 digits, 3 letters
|
81
|
-
return '%b%Y', 'monthly' if example =~ /^\w{3}\d{4}$/ # 3 letters, 4 digits
|
82
|
-
return '%Y-%b-%d', 'daily' if example =~ /^\d{4}\D\w{3}\D\d{1,2}$/ # 4 digits, separator, 3 letters, separator, 1-2 digits
|
83
|
-
return '%Y-%m-%d', 'daily' if example =~ /^\d{4}\D\d{1,2}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits, separator, 1-2 digits
|
84
|
-
return '%d-%b-%Y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{4}$/ # 1-2 digits, separator, 3 letters, separator, 4 digits
|
85
|
-
return '%Y%b%d', 'daily' if example =~ /^\d{4}\w{3}\d{1,2}$/ # 4 digits, 3 letters, 1-2 digits
|
86
|
-
return '%d%b%Y', 'daily' if example =~ /^\d{1,2}\w{3}\d{4}$/ # 1-2 digits, 3 letters, 4 digits
|
87
|
-
return '%d-%b-%y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{2}$/ # 1-2 digits, 3 letters, 2 digits
|
88
|
-
return '%b-%d-%Y', 'daily' if example =~ /^\w{3}\D\d{1,2}\D{1,2}\d{4}$/ # 3 letters, separator, 1-2 digits, separator(s), 4 digits
|
89
|
-
|
90
|
-
#our custom formats
|
91
|
-
return ':year_quarter', 'quarterly' if example =~ /^\d{4}\D[Qq]\d$/ # 4 digits, separator, Q, digit
|
92
|
-
return ':excel-1900', 'daily' if example =~ /^\d{5}$/ # 5 digits
|
93
|
-
return ':excel-1900', 'daily' if example =~ /^\d{5}\.0$/ # 5 digits dot zero excel
|
94
|
-
|
95
|
-
# No, try default date parse
|
96
|
-
# raise PostProcessorException, "Unable to guess date format for #{example}"
|
97
|
-
[nil, nil]
|
98
|
-
end
|
99
|
-
|
100
|
-
def disinfect(dates)
|
101
|
-
[*dates].collect do |date|
|
102
|
-
date.to_s.encode!('UTF-8', 'UTF-8', :invalid => :replace)
|
103
|
-
date.to_s.gsub!(/[^\x01-\x7f]/,'')
|
104
|
-
date.to_s.strip.gsub(/\s\s+/, ' ')
|
105
|
-
end
|
106
|
-
end
|
107
|
-
private
|
108
|
-
|
109
|
-
|
110
|
-
#converts date to specified format
|
111
|
-
def convert(fuzzy_date, date_format)
|
112
|
-
if date_format.nil?
|
113
|
-
# Assuming a US date format with 3 parameters (i.e. MM?DD?YYYY)
|
114
|
-
tokens = fuzzy_date.split(/\D/)
|
115
|
-
if tokens[0].length > 2 || fuzzy_date =~ /\w{2}/
|
116
|
-
# Its ISO
|
117
|
-
return DateTime.parse(fuzzy_date.to_s).to_date
|
118
|
-
else
|
119
|
-
# Guessing US
|
120
|
-
return Date.new(tokens[2].to_i, tokens[0].to_i, tokens[1].to_i)
|
121
|
-
end
|
122
|
-
else
|
123
|
-
case date_format
|
124
|
-
when ':year_quarter'
|
125
|
-
return year_quarter_formatter(fuzzy_date)
|
126
|
-
when ':excel-1900'
|
127
|
-
return excel_1900_formatter(fuzzy_date)
|
128
|
-
else #regular ruby formatter
|
129
|
-
return regular_formatter(fuzzy_date, date_format)
|
130
|
-
end
|
131
|
-
|
132
|
-
end
|
133
|
-
end
|
134
|
-
|
135
|
-
|
136
|
-
def year_quarter_formatter(fuzzy_date)
|
137
|
-
raw_date = fuzzy_date
|
138
|
-
tokens = raw_date.gsub(/[qQ]/, '-').gsub(/[a-zA-Z]/, '').split(/[^0-9]/)
|
139
|
-
tokens.delete_if {|x| x.nil? || x.empty?} # In case there are more than one delimiter because we replaced the Q
|
140
|
-
Date.new(tokens[0].to_i, tokens[1].to_i * 3, 1)
|
141
|
-
end
|
142
|
-
|
143
|
-
def excel_1900_formatter(fuzzy_date)
|
144
|
-
# handle Lotus 123 bug has 1900 as a leap year
|
145
|
-
Date.civil(1899, 12, 31) + fuzzy_date.to_i - 1 if fuzzy_date.to_i > 0
|
146
|
-
end
|
147
|
-
|
148
|
-
def regular_formatter(fuzzy_date, date_format)
|
149
|
-
# We have a date format - oh so pretty, but...
|
150
|
-
date_string = fuzzy_date
|
151
|
-
# normalize delimiters to hyphens so we do not have to make a format for each one.
|
152
|
-
# delimiters can be letters when its all numbers and delimiters only when there are letters. Sigh.
|
153
|
-
# only if no format where provided
|
154
|
-
date_string = date_string.gsub(/[^\d\w]+/, '-') if @settings[:format].nil?
|
155
|
-
|
156
|
-
#epoch date string
|
157
|
-
if date_format == 'epoch'
|
158
|
-
news = Time.at(date_string.to_i).utc.to_s.match(/\d\d\d\d-\d\d-\d\d/)
|
159
|
-
formatted_date = DateTime.strptime(news.to_s, '%Y-%m-%d').to_date
|
160
|
-
else
|
161
|
-
if date_string.to_s =~ /^(\w{3})\D(\d{2})$/
|
162
|
-
century = $2.to_i < 25 ? '20' : '19'
|
163
|
-
date_string = "#{$1} #{century}#{$2}"
|
164
|
-
formatted_date = DateTime.strptime(date_string.to_s, '%b %Y').to_date
|
165
|
-
else
|
166
|
-
formatted_date = DateTime.strptime(date_string.to_s, date_format).to_date
|
167
|
-
end
|
168
|
-
end
|
169
|
-
formatted_date+=4 if date_format == '%GW%V' #strptime makes dates on Mondays. We want Fridays.
|
170
|
-
formatted_date
|
171
|
-
end
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
#find good example of date to use as template for format
|
176
|
-
def find_good_date(all_dates)
|
177
|
-
good_sample=nil
|
178
|
-
all_dates.each do |fuzzy_date|
|
179
|
-
if usable_cell(fuzzy_date)
|
180
|
-
good_sample = fuzzy_date
|
181
|
-
break
|
182
|
-
end
|
183
|
-
end
|
184
|
-
good_sample
|
185
|
-
end
|
186
|
-
|
187
|
-
def usable_cell(cell)
|
188
|
-
return false if cell.nil? || cell.to_s.empty?
|
189
|
-
return false if cell.to_s.size > 20 # even annotated date can't be bigger than 20
|
190
|
-
|
191
|
-
return true if cell.to_s =~ /^\w{3}\D[456789]\d$/
|
192
|
-
# date is not usable as an example if it is ambiguous as to day and month
|
193
|
-
# 03/04/2012, for example, is ambiguous. 03/17/2012 is NOT ambiguous
|
194
|
-
if re = cell.to_s.match(/^(\d{1,2})\D(\d{1,2})\D\d{2,4}/) # e.g. 03/04/2012
|
195
|
-
if re[1].to_i <= 12 and re[2].to_i <= 12
|
196
|
-
return false
|
197
|
-
else
|
198
|
-
return true
|
199
|
-
end
|
200
|
-
end
|
201
|
-
|
202
|
-
if re = cell.to_s.match(/^(\d{1,2})\D\w{3}\D(\d{2})/) # 07-jun-07
|
203
|
-
if re[1].to_i <= 12 and re[2].to_i <= 12
|
204
|
-
return false
|
205
|
-
else
|
206
|
-
return true
|
207
|
-
end
|
208
|
-
end
|
209
|
-
|
210
|
-
return true if cell.to_s =~ /\d{4}/ # It has a 4 digit year somewhere
|
211
|
-
|
212
|
-
return true if cell.to_s =~ /^\w{3}\D\d{2}/ # %b-%y(d)..also not ambiguous
|
213
|
-
|
214
|
-
false # Thank you, come again
|
215
|
-
end
|
216
|
-
|
217
|
-
# Bump date to the end of the respective periods
|
218
|
-
def frequency_transform(date, frequency)
|
219
|
-
case frequency
|
220
|
-
when 'annual'
|
221
|
-
date = Date.new(date.year,12,31)
|
222
|
-
when 'quarterly'
|
223
|
-
month = 3*((date.month-1)/3 + 1) # equals 3,6,9 or 12
|
224
|
-
date = Date.new(date.year, month, 1).next_month-1
|
225
|
-
when 'monthly'
|
226
|
-
date = Date.new(date.year, date.month,1).next_month-1
|
227
|
-
else
|
228
|
-
# Do nothing for daily or weekly
|
229
|
-
end
|
230
|
-
|
231
|
-
date
|
232
|
-
end
|
233
|
-
|
234
|
-
|
235
|
-
end
|
236
|
-
end
|
237
|
-
end
|
1
|
+
module Quandl
  module Babelfish

    # Responsible for date formatting.
    #
    # Takes a column of raw date strings, guesses (or is told) their format and
    # converts every entry to a Date. When the format is guessed, dates are also
    # moved to the end of their period (year, quarter or month) according to the
    # frequency detected by analyze_date_format.
    #
    # Settings are stored on the class itself (see init), so they are shared by
    # every caller in the process.
    class DateMaid
      @defaults = {
        :format => nil # strptime format forced by the caller; nil means "guess it"
      }

      @settings = @defaults # init with defaults

      class << self

        # Replaces the current settings with the defaults overridden by
        # +user_settings+ (a Hash, e.g. { :format => '%d/%m/%Y' }).
        def init(user_settings)
          @settings = @defaults.merge(user_settings)
        end

        # Looks at all the dates and converts them to Date objects
        # (unambiguous, ISO 8601 ordering yyyy-mm-dd).
        #
        # all_dates - a single date or an Array of dates (anything responding to to_s).
        #
        # Returns an Array of Date (nil when +all_dates+ is nil).
        # Raises Error::GuessDateFormat when no format was configured and no
        # entry is usable as a sample for guessing.
        # Raises Error::InvalidDate (carrying the 1-based line and the offending
        # value) when an entry cannot be converted.
        def sweep(all_dates)
          return nil if all_dates.nil?

          all_dates = disinfect all_dates

          if @settings[:format].nil?
            # find good example and extract all info from it and apply it to each of the dates in the set
            good_sample = find_good_date(all_dates)

            raise( Error::GuessDateFormat.new, "Unable to find date format for provided dates" ) if good_sample.nil?

            date_format, frequency = analyze_date_format(good_sample)
          else
            # A forced format carries no frequency: frequency stays nil and dates
            # are not moved to the end of a period.
            date_format = @settings[:format]
          end

          all_dates.each_with_index.map do |fuzzy_date, i|
            begin
              temp_date = convert(fuzzy_date, date_format)
            rescue StandardError
              # Any conversion failure is reported as an invalid date at that position.
              raise( Error::InvalidDate.new( line: i+1, row: fuzzy_date, context: 'convert' ), "Invalid date '#{fuzzy_date}'" )
            end
            frequency_transform(temp_date, frequency)
          end
        end

        # Guesses the format of +example+ (a cleaned date String).
        #
        # Returns a two element Array [format, frequency]:
        #   format    - a strptime format, or one of the custom markers
        #               'epoch', ':year_quarter', ':excel-1900', or nil when unknown
        #   frequency - 'annual', 'quarterly', 'monthly', 'weekly', 'daily' or nil
        # Returns plain nil when +example+ is nil.
        #
        # The returned formats use '-' as separator because regular_formatter
        # normalizes every delimiter to '-' before parsing.
        def analyze_date_format(example)
          return nil if example.nil?

          # Regular formats and Custom formats (where Date.parse and Date.strptime
          # fear to tread)
          if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{4}/) # eg "07/03/2012"
            # A first field above 12 cannot be a month, so it must be the day.
            if re[1].to_i > 12
              return '%d-%m-%Y', nil
            else
              return '%m-%d-%Y', nil
            end
          end
          if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{2}/) # eg "07/03/12"
            if re[1].to_i > 12
              return '%d-%m-%y', nil
            else
              return '%m-%d-%y', nil
            end
          end
          # order these guys from most specific to most general
          return "%Y", "annual" if example =~ /^\d{4}[\s]?-[\s]?\d{4}$/
          return '%Y%m%d', 'daily' if example =~ /^\d{8}$/ && example[4..5].to_i < 13 && example[6..7].to_i < 32 # precisely 8 digits - yyyymmdd
          return 'epoch', 'daily' if example =~ /^\d{7}.*$/ # 7 or more digits - epoch
          return '%Y', 'annual' if example =~ /^\d{4}$/ # 4 digits
          return '%Y', 'annual' if example =~ /^\d{4}\.0$/ # 4 digits with a dot 0 for excel
          return ':year_quarter', 'quarterly' if example =~ /^\d{4}[Qq]\d$/ # 4 digits, Q, digit (here because the next pattern would override it)
          return '%YM%m', 'monthly' if example =~ /^\d{4}M\d{1,2}$/ # 2007M08
          return '%GW%V', 'weekly' if example =~ /^\d{4}W\d{1,2}$/ # 2012W01
          return '%Y-%m', 'monthly' if example =~ /^\d{4}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits
          return '%m-%Y', 'monthly' if example =~ /^\d{1,2}\D\d{4}$/ # 1-2 digits, separator, 4 digits
          return '%Y%m', 'monthly' if example =~ /^\d{6}$/ # 6 digits
          return '%Y-%b', 'monthly' if example =~ /^\d{4}\D\w{3}$/ # 4 digits, separator, 3 letters
          return '%b-%Y', 'monthly' if example =~ /^\w{3}\D\d{4}$/ # 3 letters, separator, 4 digits
          return '%b-%y', 'monthly' if example =~ /^\w{3}\D\d{2}$/ # 3 letters, separator, 2 digits
          return '%Y%b', 'monthly' if example =~ /^\d{4}\w{3}$/ # 4 digits, 3 letters
          return '%b%Y', 'monthly' if example =~ /^\w{3}\d{4}$/ # 3 letters, 4 digits
          return '%Y-%b-%d', 'daily' if example =~ /^\d{4}\D\w{3}\D\d{1,2}$/ # 4 digits, separator, 3 letters, separator, 1-2 digits
          return '%Y-%m-%d', 'daily' if example =~ /^\d{4}\D\d{1,2}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits, separator, 1-2 digits
          return '%d-%b-%Y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{4}$/ # 1-2 digits, separator, 3 letters, separator, 4 digits
          return '%Y%b%d', 'daily' if example =~ /^\d{4}\w{3}\d{1,2}$/ # 4 digits, 3 letters, 1-2 digits
          return '%d%b%Y', 'daily' if example =~ /^\d{1,2}\w{3}\d{4}$/ # 1-2 digits, 3 letters, 4 digits
          return '%d-%b-%y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{2}$/ # 1-2 digits, separator, 3 letters, separator, 2 digits
          return '%b-%d-%Y', 'daily' if example =~ /^\w{3}\D\d{1,2}\D{1,2}\d{4}$/ # 3 letters, separator, 1-2 digits, separator(s), 4 digits

          # our custom formats
          return ':year_quarter', 'quarterly' if example =~ /^\d{4}\D[Qq]\d$/ # 4 digits, separator, Q, digit
          return ':excel-1900', 'daily' if example =~ /^\d{5}$/ # 5 digits
          return ':excel-1900', 'daily' if example =~ /^\d{5}\.0$/ # 5 digits dot zero excel

          # Nothing matched: convert falls back to default date parsing for a nil format.
          [nil, nil]
        end

        # Normalizes raw date cells into clean ASCII strings.
        #
        # dates - a single value or an Array of values; each is converted with to_s.
        #
        # For every entry: drops every byte outside \x01-\x7f, strips leading and
        # trailing whitespace and squeezes runs of whitespace into one space.
        # The caller's objects are never modified, so frozen strings are accepted.
        #
        # Returns a new Array of new Strings.
        def disinfect(dates)
          [*dates].map do |date|
            # Reading the bytes as binary makes every byte >= 0x80 "undefined" in
            # UTF-8, so encode drops it and returns a new, valid string even when
            # the input holds broken byte sequences.
            ascii = date.to_s.encode('UTF-8', 'binary', :invalid => :replace, :undef => :replace, :replace => '')
            # Only NUL can still fall outside the allowed range at this point.
            ascii.gsub(/[^\x01-\x7f]/, '').strip.gsub(/\s\s+/, ' ')
          end
        end

        private

        # Converts +fuzzy_date+ (String) to a Date using +date_format+, which is
        # a strptime format, one of the custom markers returned by
        # analyze_date_format, or nil (unknown format).
        def convert(fuzzy_date, date_format)
          if date_format.nil?
            # Assuming a US date format with 3 parameters (i.e. MM?DD?YYYY)
            tokens = fuzzy_date.split(/\D/)
            # NOTE(review): \w also matches digits, so any value containing two
            # consecutive digits takes the DateTime.parse branch - confirm whether
            # letters only were intended here.
            if tokens[0].length > 2 || fuzzy_date =~ /\w{2}/
              # Its ISO
              DateTime.parse(fuzzy_date.to_s).to_date
            else
              # Guessing US
              Date.new(tokens[2].to_i, tokens[0].to_i, tokens[1].to_i)
            end
          else
            case date_format
            when ':year_quarter'
              year_quarter_formatter(fuzzy_date)
            when ':excel-1900'
              excel_1900_formatter(fuzzy_date)
            else # regular ruby formatter
              regular_formatter(fuzzy_date, date_format)
            end
          end
        end

        # Parses year/quarter strings such as "2012Q2" or "2012-Q2".
        # Returns the first day of the quarter's last month (quarter 2 -> June 1st).
        def year_quarter_formatter(fuzzy_date)
          tokens = fuzzy_date.gsub(/[qQ]/, '-').gsub(/[a-zA-Z]/, '').split(/[^0-9]/)
          tokens.delete_if { |x| x.nil? || x.empty? } # In case there are more than one delimiter because we replaced the Q
          Date.new(tokens[0].to_i, tokens[1].to_i * 3, 1)
        end

        # Converts an Excel (1900 date system) serial number to a Date.
        # Returns nil when the serial is not a positive number.
        def excel_1900_formatter(fuzzy_date)
          # handle Lotus 123 bug has 1900 as a leap year
          Date.civil(1899, 12, 31) + fuzzy_date.to_i - 1 if fuzzy_date.to_i > 0
        end

        # Parses +fuzzy_date+ with a strptime +date_format+ (or the 'epoch' marker).
        def regular_formatter(fuzzy_date, date_format)
          # We have a date format - oh so pretty, but...
          date_string = fuzzy_date
          # normalize delimiters to hyphens so we do not have to make a format for each one.
          # delimiters can be letters when its all numbers and delimiters only when there are letters. Sigh.
          # only if no format was provided by the user
          date_string = date_string.gsub(/[^\d\w]+/, '-') if @settings[:format].nil?

          if date_format == 'epoch'
            # epoch date string: seconds since 1970, reduced to its UTC calendar day
            news = Time.at(date_string.to_i).utc.to_s.match(/\d\d\d\d-\d\d-\d\d/)
            formatted_date = DateTime.strptime(news.to_s, '%Y-%m-%d').to_date
          elsif (short = date_string.to_s.match(/^(\w{3})\D(\d{2})$/))
            # Month name with a 2 digit year: years below 25 are read as 20xx,
            # everything else as 19xx.
            century = short[2].to_i < 25 ? '20' : '19'
            date_string = "#{short[1]} #{century}#{short[2]}"
            formatted_date = DateTime.strptime(date_string.to_s, '%b %Y').to_date
          else
            formatted_date = DateTime.strptime(date_string.to_s, date_format).to_date
          end
          formatted_date += 4 if date_format == '%GW%V' # strptime makes dates on Mondays. We want Fridays.
          formatted_date
        end

        # Finds a good example of a date to use as template for the format.
        # Returns the first usable entry, or nil when there is none.
        def find_good_date(all_dates)
          all_dates.find { |fuzzy_date| usable_cell(fuzzy_date) }
        end

        # Tells whether +cell+ can serve as the sample the format is guessed from.
        def usable_cell(cell)
          return false if cell.nil? || cell.to_s.empty?
          return false if cell.to_s.size > 20 # even annotated date can't be bigger than 20

          return true if cell.to_s =~ /^\w{3}\D[456789]\d$/
          # date is not usable as an example if it is ambiguous as to day and month
          # 03/04/2012, for example, is ambiguous. 03/17/2012 is NOT ambiguous
          if re = cell.to_s.match(/^(\d{1,2})\D(\d{1,2})\D\d{2,4}/) # e.g. 03/04/2012
            return !(re[1].to_i <= 12 && re[2].to_i <= 12)
          end

          if re = cell.to_s.match(/^(\d{1,2})\D\w{3}\D(\d{2})/) # 07-jun-07
            return !(re[1].to_i <= 12 && re[2].to_i <= 12)
          end

          return true if cell.to_s =~ /\d{4}/ # It has a 4 digit year somewhere

          return true if cell.to_s =~ /^\w{3}\D\d{2}/ # %b-%y(d)..also not ambiguous

          false # Thank you, come again
        end

        # Bumps +date+ to the end of its period for the given +frequency+.
        # Daily, weekly and unknown (nil) frequencies are returned unchanged.
        def frequency_transform(date, frequency)
          case frequency
          when 'annual'
            date = Date.new(date.year, 12, 31)
          when 'quarterly'
            month = 3 * ((date.month - 1) / 3 + 1) # equals 3,6,9 or 12
            date = Date.new(date.year, month, 1).next_month - 1
          when 'monthly'
            date = Date.new(date.year, date.month, 1).next_month - 1
          else
            # Do nothing for daily or weekly
          end

          date
        end

      end
    end
  end
end
|
@@ -1,80 +1,80 @@
|
|
1
|
-
module Quandl
|
2
|
-
module Babelfish
|
3
|
-
|
4
|
-
#responsible for number cleaning
|
5
|
-
class NumberMaid
|
6
|
-
@defaults = {
|
7
|
-
:decimal_mark => Regexp.escape('.'),
|
8
|
-
:ignore_brackets => false, # Brackets ARE negative by default
|
9
|
-
}
|
10
|
-
|
11
|
-
@settings = @defaults #init with defaults
|
12
|
-
|
13
|
-
class << self
|
14
|
-
|
15
|
-
|
16
|
-
def init(user_settings)
|
17
|
-
@settings=@defaults.merge(user_settings)
|
18
|
-
@escaped_decimal = Regexp.escape(@settings[:decimal_mark])
|
19
|
-
end
|
20
|
-
|
21
|
-
#cleans each number one by one
|
22
|
-
def clean(dirty_numbers)
|
23
|
-
return nil if dirty_numbers.nil?
|
24
|
-
numbers=[]
|
25
|
-
Array(dirty_numbers).each do |cell|
|
26
|
-
numbers << cell_to_number(cell.to_s)
|
27
|
-
end
|
28
|
-
|
29
|
-
(numbers.size == 1) ? numbers[0] : numbers
|
30
|
-
end
|
31
|
-
|
32
|
-
def cell_to_number(num)
|
33
|
-
return nil if num.nil?
|
34
|
-
# Remove annotations
|
35
|
-
# if there is something in parenthesis and a number elsewhere, nuke the parenthesis
|
36
|
-
temp = num.gsub(/[\(\[\{].*[\)\}\]]/, '')
|
37
|
-
num = temp if temp.match(/\d/)
|
38
|
-
|
39
|
-
num.gsub!("est.", '')
|
40
|
-
|
41
|
-
#check for exponents by searching for 'e' 'E' or variations of 'x 10' '*10' and 'X10^'
|
42
|
-
is_exp = false
|
43
|
-
expmultiplier = 1
|
44
|
-
m = /(\s)*(E|e|[X|x|\*](\s)*10(\^)?)(\s)*/.match(num)
|
45
|
-
#check if match is made, preceeded by a number/decimal, and succeeded by a digit or a plus/minus sign
|
46
|
-
if !m.nil? and m.pre_match =~ /[0-9#{@escaped_decimal}]$/ and m.post_match =~ /^([\-+0-9])/
|
47
|
-
is_exp = true
|
48
|
-
num = m.pre_match
|
49
|
-
expmultiplier = 10 ** /^[0-9\-+]*/.match(m.post_match)[0].to_i
|
50
|
-
end
|
51
|
-
is_million = (num =~ /million/i)
|
52
|
-
is_billion = (num =~ /billion/i)
|
53
|
-
is_negative = (num =~ /-[\d]/ or (!@settings[:ignore_brackets] and num =~ /\([\d]/))
|
54
|
-
|
55
|
-
# watch out for two numbers, like a range eg "27.3 - 33.9"
|
56
|
-
# how: if you a see a number followed by a non number char that is not the decimal marker, kill everything to the right of that
|
57
|
-
num.gsub!(/(\d) (\d)/, '\1\2')
|
58
|
-
if m = num.match(/-?\s*[,\d\.]+/)
|
59
|
-
num = m[0]
|
60
|
-
end
|
61
|
-
|
62
|
-
# only keep #s and decimal mark
|
63
|
-
num.gsub!(/[^0-9#{@escaped_decimal}]/, '')
|
64
|
-
num.gsub!(/[^0-9]/, '.')
|
65
|
-
|
66
|
-
return nil if num.nil? || num !~ /[\d]/
|
67
|
-
return nil if num.end_with?(".")
|
68
|
-
return nil if num.count(".") > 1
|
69
|
-
cell = num.nil? ? 0.0 : Float("%.#{14}g" % num)
|
70
|
-
cell *= 1e6 if is_million
|
71
|
-
cell *= 1e9 if is_billion
|
72
|
-
cell *= -1 if is_negative
|
73
|
-
cell *= expmultiplier if is_exp
|
74
|
-
cell
|
75
|
-
end
|
76
|
-
|
77
|
-
end
|
78
|
-
end
|
79
|
-
end
|
1
|
+
module Quandl
  module Babelfish

    # Responsible for number cleaning: turns messy spreadsheet-style cells
    # ("1,234.5", "(500)", "4.5 million", "2.5e3", "est. 12") into Floats.
    #
    # Settings are stored on the class itself (see init), so they are shared by
    # every caller in the process.
    class NumberMaid
      @defaults = {
        :decimal_mark => '.',       # raw character; it is regexp-escaped where it is used
        :ignore_brackets => false,  # Brackets ARE negative by default
      }

      @settings = @defaults # init with defaults
      # Escaped form of the decimal mark, ready to be interpolated into a
      # character class. Set here as well so cleaning works before init is called.
      @escaped_decimal = Regexp.escape(@defaults[:decimal_mark])

      class << self

        # Replaces the current settings with the defaults overridden by
        # +user_settings+ (Hash with :decimal_mark and/or :ignore_brackets).
        def init(user_settings)
          @settings = @defaults.merge(user_settings)
          @escaped_decimal = Regexp.escape(@settings[:decimal_mark])
        end

        # Cleans each number one by one.
        #
        # dirty_numbers - a single cell or an Array of cells (converted with to_s).
        #
        # Returns nil for nil input, a single value when exactly one cell was
        # given, otherwise an Array. Each value is a Float, or nil when the cell
        # holds no usable number.
        def clean(dirty_numbers)
          return nil if dirty_numbers.nil?

          numbers = Array(dirty_numbers).map { |cell| cell_to_number(cell.to_s) }

          (numbers.size == 1) ? numbers[0] : numbers
        end

        # Converts one cell (String) to a Float, or nil when it holds no usable
        # number. The string passed in is never modified.
        def cell_to_number(num)
          return nil if num.nil?

          # Remove annotations
          # if there is something in parenthesis and a number elsewhere, nuke the parenthesis
          temp = num.gsub(/[\(\[\{].*[\)\}\]]/, '')
          num = temp if temp.match(/\d/)

          num = num.gsub("est.", '')

          # check for exponents by searching for 'e' 'E' or variations of 'x 10' '*10' and 'X10^'
          is_exp = false
          expmultiplier = 1
          m = /\s*(?:[Ee]|[Xx*]\s*10\^?)\s*/.match(num)
          # check if match is made, preceded by a number/decimal, and succeeded by a digit or a plus/minus sign
          if m && m.pre_match =~ /[0-9#{@escaped_decimal}]$/ && m.post_match =~ /^([\-+0-9])/
            is_exp = true
            num = m.pre_match
            expmultiplier = 10 ** /^[0-9\-+]*/.match(m.post_match)[0].to_i
          end
          is_million = (num =~ /million/i)
          is_billion = (num =~ /billion/i)
          # A bracketed number counts as negative unless :ignore_brackets is set.
          is_negative = (num =~ /-[\d]/ || (!@settings[:ignore_brackets] && num =~ /\([\d]/))

          # watch out for two numbers, like a range eg "27.3 - 33.9"
          # how: keep only the first run of digits, commas and dots
          num = num.gsub(/(\d) (\d)/, '\1\2') # "1 234" -> "1234"
          if (first_number = num.match(/-?\s*[,\d\.]+/))
            num = first_number[0]
          end

          # only keep #s and decimal mark, then express the decimal mark as "."
          num = num.gsub(/[^0-9#{@escaped_decimal}]/, '')
          num = num.gsub(/[^0-9]/, '.')

          return nil if num !~ /[\d]/
          return nil if num.end_with?(".")
          return nil if num.count(".") > 1

          # Round-trip through %.14g to limit the value to 14 significant digits.
          cell = Float("%.14g" % num)
          cell *= 1e6 if is_million
          cell *= 1e9 if is_billion
          cell *= -1 if is_negative
          cell *= expmultiplier if is_exp
          cell
        end

      end
    end
  end
end
|
@@ -1,5 +1,5 @@
|
|
1
|
-
module Quandl
|
2
|
-
module Babelfish
|
3
|
-
VERSION = '0.0.6'
|
4
|
-
end
|
1
|
+
module Quandl
  module Babelfish
    # Gem version. Frozen so the shared constant cannot be modified in place.
    VERSION = '0.0.7'.freeze
  end
end
|