quandl_babelfish 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +7 -0
- data/.travis.yml +12 -0
- data/Gemfile +2 -0
- data/LICENSE +7 -0
- data/README.md +18 -0
- data/UPGRADE.md +6 -0
- data/lib/quandl/babelfish/cleaner.rb +33 -0
- data/lib/quandl/babelfish/data.rb +22 -0
- data/lib/quandl/babelfish/date_maid.rb +238 -0
- data/lib/quandl/babelfish/number_maid.rb +80 -0
- data/lib/quandl/babelfish/version.rb +5 -0
- data/lib/quandl/babelfish.rb +22 -0
- data/lib/quandl/error/guess_date_format.rb +6 -0
- data/lib/quandl/error/invalid_date.rb +6 -0
- data/lib/quandl/error/unknown_date_format.rb +6 -0
- data/quandl_babelfish.gemspec +21 -0
- data/spec/lib/quandl/babelfish/cleaner_spec.rb +47 -0
- data/spec/lib/quandl/babelfish/data_spec.rb +33 -0
- data/spec/lib/quandl/babelfish/date_maid_spec.rb +529 -0
- data/spec/lib/quandl/babelfish/number_maid_spec.rb +126 -0
- data/spec/lib/quandl/babelfish_spec.rb +15 -0
- data/spec/spec_helper.rb +13 -0
- data/spec/support/matchers/be_eq_at_index.rb +32 -0
- metadata +102 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 5aa7214cd7f3c0541f0aefe31d983622065bea21
|
4
|
+
data.tar.gz: 442699776fdbd64183d877e962d564ce2c770664
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 5a577f380ce3771869c553ebb418c82aa5e9dccb1f493e5cfd06f93a3d804c102df3cd6bb38039a5ddc2630d7ed132c4e68fbb6186b60dab0492f73099bd06cf
|
7
|
+
data.tar.gz: ea13348aa261380045206aacadda5c8ef4915f7ea9c9210c00f17d1c75f41bee90ba243f240da73e0431c8a573e9ab42354fe2f55e91732f464c2beb7d0663d8
|
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
Copyright (c) 2012-2013 Quandl
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
4
|
+
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
6
|
+
|
7
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
data/UPGRADE.md
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
module Quandl
module Babelfish

  # Turns a raw table (an array of rows whose first column holds dates) into
  # a Quandl::Babelfish::Data of cleaned rows.
  class Cleaner
    class << self
      # Cleans every row of dirty_array: the first cell of each row goes through
      # DateMaid, the remaining cells through NumberMaid.
      #
      # dirty_array     - Array of rows (each row an Array); may start with a header row.
      # date_settings   - Hash handed to DateMaid.init.
      # number_settings - Hash handed to NumberMaid.init.
      #
      # The first row is treated as a header when DateMaid cannot recognise a
      # date format in its first cell.
      #
      # Returns a Quandl::Babelfish::Data whose rows are [date, number, ...] and
      # whose headers are the cleaned header row (or nil when there was none).
      # Returns [nil, nil] when dirty_array is nil.
      # The caller's array is left untouched.
      def process(dirty_array, date_settings={}, number_settings={})
        return nil,nil if dirty_array.nil?
        # An empty table has no first cell to inspect, so hand back an empty result
        # instead of failing on dirty_array[0][0].
        return Quandl::Babelfish::Data.new([], headers: nil) if dirty_array.empty?

        # Shallow copy: dropping the header row must not alter the caller's array.
        rows = dirty_array.dup

        #check if first line is header
        first_cell = DateMaid.disinfect(rows[0][0])[0]
        header = DateMaid.disinfect(rows.shift) unless DateMaid.analyze_date_format(first_cell)[0]

        #converts dates first
        dates = rows.collect { |row| row[0] }
        DateMaid.init(date_settings)
        clean_dates = DateMaid.sweep(dates)

        #clean numbers later
        NumberMaid.init(number_settings)
        clean_array = rows.each_with_index.map do |row, i|
          values = row[1..-1] || []
          numbers = NumberMaid.clean(values)
          # NumberMaid.clean unwraps one-element input to a bare value (possibly nil);
          # wrap it again so a single unparseable cell stays in the row as nil.
          numbers = [numbers] if values.size == 1
          [clean_dates[i]].concat(numbers) #add clean date and all clean numbers
        end

        Quandl::Babelfish::Data.new( clean_array, headers: header )
      end
    end
  end

end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Quandl
module Babelfish

  # An Array that can additionally carry column headers.
  class Data < Array

    # Accepts the same arguments as Array.new. A trailing Hash is removed from
    # the argument list and treated as options; its :headers entry is stored
    # only when it is an Array.
    def initialize(*args, &block)
      trailing = args.last
      if trailing.is_a?(Hash)
        args.pop
        if trailing.has_key?(:headers) && trailing[:headers].is_a?(Array)
          @headers = trailing[:headers]
        end
      end
      # everything that is left goes to Array#initialize
      super(*args, &block)
    end

    # Returns the headers given at construction, or nil when none were kept.
    def headers
      @headers = nil unless @headers
      @headers
    end

  end

end
end
|
@@ -0,0 +1,238 @@
|
|
1
|
+
module Quandl
module Babelfish

  # Responsible for date formatting: guesses the format of raw date cells and
  # converts them to Date objects.
  #
  # Settings live in class-level instance variables, so a call to init changes
  # the configuration used by every later call on this class.
  class DateMaid
    @defaults = {
      :format => nil
    }

    @settings = @defaults #init with defaults

    class << self

      # Replaces the active settings with the defaults overlaid by user_settings.
      #
      # user_settings - Hash; the only key read by this class is :format
      #                 (a strptime format string, or nil to guess the format).
      def init(user_settings)
        @settings=@defaults.merge(user_settings)
      end

      # Looks at all the dates and converts them to Date objects.
      #
      # When no :format is configured, the first usable (unambiguous) cell is
      # used to guess a format and a frequency; each date is then moved to the
      # end of its period (see frequency_transform). When :format is configured
      # no frequency is derived and dates are returned as parsed.
      #
      # all_dates - Array of raw date cells (or a single cell).
      #
      # Returns an Array of dates, or nil when all_dates is nil.
      # Raises Error::GuessDateFormat when a format must be guessed and no cell is usable.
      # Raises Error::InvalidDate (with the offending cell) when a cell cannot be converted.
      def sweep(all_dates)
        return nil if all_dates.nil?

        all_dates = disinfect all_dates

        if @settings[:format].nil?
          #find good example and extract all info from it and apply it to each of the dates in the set
          good_sample = find_good_date(all_dates)

          raise Error::GuessDateFormat.new("Unable to find date format for provide dates") if good_sample.nil?

          date_format, frequency = analyze_date_format(good_sample)
        else
          date_format = @settings[:format]
          frequency = nil
        end

        all_dates.map do |fuzzy_date|
          begin
            temp_date = convert(fuzzy_date, date_format)
          rescue StandardError
            # any conversion failure is reported as an invalid date for that cell
            raise Error::InvalidDate, fuzzy_date
          end
          frequency_transform(temp_date, frequency)
        end
      end

      # Guesses the format and frequency of one example date string.
      #
      # example - String to inspect.
      #
      # Returns a two-element result [format, frequency]:
      #   format    - a strptime format, or one of the custom markers
      #               'epoch', ':year_quarter', ':excel-1900', or nil when unknown.
      #   frequency - 'annual', 'quarterly', 'monthly', 'weekly', 'daily' or nil.
      # Returns plain nil when example is nil.
      def analyze_date_format(example)
        return nil if example.nil?

        # Regular formats and Custom formats (where Date.parse and Date.strptime
        # fear to tread)
        if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{4}/) # eg "07/03/2012"
          # a first number above 12 cannot be a month, so it must be the day
          if re[1].to_i > 12
            return '%d-%m-%Y', nil
          else
            return '%m-%d-%Y', nil
          end
        end
        if re = example.match(/^(\d{1,2})\D(\d{1,2})\D\d{2}/) # eg "07/03/12"
          if re[1].to_i > 12
            return '%d-%m-%y', nil
          else
            return '%m-%d-%y', nil
          end
        end
        # order these guys from most specific to most general
        return "%Y", "annual" if example =~ /^\d{4}[\s]?-[\s]?\d{4}$/
        return '%Y%m%d', 'daily' if example =~ /^\d{8}$/ && example[4..5].to_i < 13 && example[6..7].to_i < 32 # precisely 8 digits - yyyymmdd
        return 'epoch', 'daily' if example =~ /^\d{7}.*$/ # 7 or more digits - epoch
        return '%Y', 'annual' if example =~ /^\d{4}$/ # 4 digits
        return '%Y', 'annual' if example =~ /^\d{4}\.0$/ # 4 digits with a dot 0 for excel
        return ':year_quarter', 'quarterly' if example =~ /^\d{4}[Qq]\d$/ # 4 digits, Q, digit (here because the next pattern would override it)
        return '%YM%m', 'monthly' if example =~ /^\d{4}M\d{1,2}$/ # 2007M08
        return '%GW%V', 'weekly' if example =~ /^\d{4}W\d{1,2}$/ # 2012W01
        return '%Y-%m', 'monthly' if example =~ /^\d{4}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits
        return '%m-%Y', 'monthly' if example =~ /^\d{1,2}\D\d{4}$/ # 1-2 digits, separator, 4 digits
        return '%Y%m', 'monthly' if example =~ /^\d{6}$/ # 6 digits
        return '%Y-%b', 'monthly' if example =~ /^\d{4}\D\w{3}$/ # 4 digits, separator, 3 letters
        return '%b-%Y', 'monthly' if example =~ /^\w{3}\D\d{4}$/ # 3 letters, separator, 4 digits
        return '%b-%y', 'monthly' if example =~ /^\w{3}\D\d{2}$/ # 3 letters, separator, 2 digits
        return '%Y%b', 'monthly' if example =~ /^\d{4}\w{3}$/ # 4 digits, 3 letters
        return '%b%Y', 'monthly' if example =~ /^\w{3}\d{4}$/ # 3 letters, 4 digits
        return '%Y-%b-%d', 'daily' if example =~ /^\d{4}\D\w{3}\D\d{1,2}$/ # 4 digits, separator, 3 letters, separator, 1-2 digits
        return '%Y-%m-%d', 'daily' if example =~ /^\d{4}\D\d{1,2}\D\d{1,2}$/ # 4 digits, separator, 1-2 digits, separator, 1-2 digits
        return '%d-%b-%Y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{4}$/ # 1-2 digits, separator, 3 letters, separator, 4 digits
        return '%Y%b%d', 'daily' if example =~ /^\d{4}\w{3}\d{1,2}$/ # 4 digits, 3 letters, 1-2 digits
        return '%d%b%Y', 'daily' if example =~ /^\d{1,2}\w{3}\d{4}$/ # 1-2 digits, 3 letters, 4 digits
        return '%d-%b-%y', 'daily' if example =~ /^\d{1,2}\D\w{3}\D\d{2}$/ # 1-2 digits, separator, 3 letters, separator, 2 digits
        return '%b-%d-%Y', 'daily' if example =~ /^\w{3}\D\d{1,2}\D{1,2}\d{4}$/ # 3 letters, separator, 1-2 digits, separator(s), 4 digits

        #our custom formats
        return ':year_quarter', 'quarterly' if example =~ /^\d{4}\D[Qq]\d$/ # 4 digits, separator, Q, digit
        return ':excel-1900', 'daily' if example =~ /^\d{5}$/ # 5 digits
        return ':excel-1900', 'daily' if example =~ /^\d{5}\.0$/ # 5 digits dot zero excel

        # Nothing matched: the caller falls back to default date parsing
        [nil, nil]
      end

      # Normalises raw cells to plain ASCII strings.
      #
      # Each cell is converted with to_s, invalid UTF-8 is replaced, every
      # character outside \x01-\x7f is dropped, surrounding whitespace is
      # stripped and runs of whitespace collapse to a single space.
      #
      # dates - a single cell or an Array of cells.
      #
      # Returns a new Array of new Strings; the cells passed in are never
      # modified, so frozen strings are accepted.
      def disinfect(dates)
        [*dates].collect do |date|
          # non-bang calls: date.to_s is the caller's own object when date is a String
          text = date.to_s.encode('UTF-8', 'UTF-8', :invalid => :replace)
          text.gsub(/[^\x01-\x7f]/,'').strip.gsub(/\s\s+/, ' ')
        end
      end

      private

      # Converts one cleaned date string to a Date using date_format.
      # With a nil format the string is either handed to DateTime.parse or
      # read as US-style month/day/year tokens.
      def convert(fuzzy_date, date_format)
        if date_format.nil?
          # Assuming a US date format with 3 parameters (i.e. MM?DD?YYYY)
          tokens = fuzzy_date.split(/\D/)
          if tokens[0].length > 2 || fuzzy_date =~ /\w{2}/
            # Its ISO
            return DateTime.parse(fuzzy_date.to_s).to_date
          else
            # Guessing US
            return Date.new(tokens[2].to_i, tokens[0].to_i, tokens[1].to_i)
          end
        else
          case date_format
          when ':year_quarter'
            return year_quarter_formatter(fuzzy_date)
          when ':excel-1900'
            return excel_1900_formatter(fuzzy_date)
          else #regular ruby formatter
            return regular_formatter(fuzzy_date, date_format)
          end
        end
      end

      # Parses year/quarter strings such as "2012Q1" or "2012-Q1".
      # Returns the first day of the quarter's last month.
      def year_quarter_formatter(fuzzy_date)
        tokens = fuzzy_date.gsub(/[qQ]/, '-').gsub(/[a-zA-Z]/, '').split(/[^0-9]/)
        tokens.delete_if {|x| x.nil? || x.empty?} # In case there are more than one delimiter because we replaced the Q
        Date.new(tokens[0].to_i, tokens[1].to_i * 3, 1)
      end

      # Converts an Excel (1900 date system) serial number to a Date.
      # Returns nil when the serial is not a positive integer.
      def excel_1900_formatter(fuzzy_date)
        # handle Lotus 123 bug has 1900 as a leap year
        Date.civil(1899, 12, 31) + fuzzy_date.to_i - 1 if fuzzy_date.to_i > 0
      end

      # Parses fuzzy_date with a strptime format or the 'epoch' marker.
      def regular_formatter(fuzzy_date, date_format)
        # We have a date format - oh so pretty, but...
        date_string = fuzzy_date
        # normalize delimiters to hyphens so we do not have to make a format for each one.
        # delimiters can be letters when its all numbers and delimiters only when there are letters. Sigh.
        # only if no format where provided
        date_string = date_string.gsub(/[^\d\w]+/, '-') if @settings[:format].nil?

        #epoch date string
        if date_format == 'epoch'
          news = Time.at(date_string.to_i).utc.to_s.match(/\d\d\d\d-\d\d-\d\d/)
          formatted_date = DateTime.strptime(news.to_s, '%Y-%m-%d').to_date
        elsif (short_year = date_string.to_s.match(/^(\w{3})\D(\d{2})$/))
          # "Jan-07" style: expand the two-digit year; values below 25 are read as 20xx, the rest as 19xx
          century = short_year[2].to_i < 25 ? '20' : '19'
          date_string = "#{short_year[1]} #{century}#{short_year[2]}"
          formatted_date = DateTime.strptime(date_string.to_s, '%b %Y').to_date
        else
          formatted_date = DateTime.strptime(date_string.to_s, date_format).to_date
        end
        formatted_date+=4 if date_format == '%GW%V' #strptime makes dates on Mondays. We want Fridays.
        formatted_date
      end

      #find good example of date to use as template for format
      # Returns the first usable cell, or nil when there is none.
      def find_good_date(all_dates)
        all_dates.find { |fuzzy_date| usable_cell(fuzzy_date) }
      end

      # Tells whether cell is an unambiguous example from which a date format can be guessed.
      def usable_cell(cell)
        return false if cell.nil? || cell.to_s.empty?
        return false if cell.to_s.size > 20 # even annotated date can't be bigger than 20

        return true if cell.to_s =~ /^\w{3}\D[456789]\d$/
        # date is not usable as an example if it is ambiguous as to day and month
        # 03/04/2012, for example, is ambiguous. 03/17/2012 is NOT ambiguous
        if re = cell.to_s.match(/^(\d{1,2})\D(\d{1,2})\D\d{2,4}/) # e.g. 03/04/2012
          return !(re[1].to_i <= 12 && re[2].to_i <= 12)
        end

        if re = cell.to_s.match(/^(\d{1,2})\D\w{3}\D(\d{2})/) # 07-jun-07
          return !(re[1].to_i <= 12 && re[2].to_i <= 12)
        end

        return true if cell.to_s =~ /\d{4}/ # It has a 4 digit year somewhere

        return true if cell.to_s =~ /^\w{3}\D\d{2}/ # %b-%y(d)..also not ambiguous

        false # Thank you, come again
      end

      # Bump date to the end of the respective periods
      def frequency_transform(date, frequency)
        case frequency
        when 'annual'
          date = Date.new(date.year,12,31)
        when 'quarterly'
          month = 3*((date.month-1)/3 + 1) # equals 3,6,9 or 12
          date = Date.new(date.year, month, 1).next_month-1
        when 'monthly'
          date = Date.new(date.year, date.month,1).next_month-1
        else
          # Do nothing for daily or weekly
        end

        date
      end

    end
  end
end
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
module Quandl
module Babelfish

  # Responsible for number cleaning: extracts a Float from free-form cell text.
  #
  # Settings live in class-level instance variables, so a call to init changes
  # the configuration used by every later call on this class.
  class NumberMaid
    @defaults = {
      :decimal_mark => '.', # stored unescaped; escaping happens once, below and in init
      :ignore_brackets => false, # Brackets ARE negative by default
    }

    @settings = @defaults #init with defaults
    # Prepared up front so cleaning works with the defaults even if init is never called.
    @escaped_decimal = Regexp.escape(@defaults[:decimal_mark])

    class << self

      # Replaces the active settings with the defaults overlaid by user_settings.
      #
      # user_settings - Hash; recognised keys:
      #                 :decimal_mark    - String used as the decimal separator.
      #                 :ignore_brackets - when true, "(5)" is not read as negative.
      def init(user_settings)
        @settings=@defaults.merge(user_settings)
        @escaped_decimal = Regexp.escape(@settings[:decimal_mark])
      end

      #cleans each number one by one
      #
      # dirty_numbers - a single cell or an Array of cells.
      #
      # Returns nil for nil input, the bare cleaned value when exactly one cell
      # was given, otherwise an Array of cleaned values (nil for cells that hold
      # no number).
      def clean(dirty_numbers)
        return nil if dirty_numbers.nil?
        numbers = Array(dirty_numbers).map { |cell| cell_to_number(cell.to_s) }

        (numbers.size == 1) ? numbers[0] : numbers
      end

      # Extracts a Float from the text of one cell.
      #
      # num - cell text; it is never modified, so frozen strings are accepted.
      #
      # Returns a Float, or nil when no single well-formed number can be read.
      def cell_to_number(num)
        return nil if num.nil?
        # private copy: the in-place edits below must not reach the caller's string
        num = num.to_s.dup

        # Remove annotations
        # if there is something in parenthesis and a number elsewhere, nuke the parenthesis
        temp = num.gsub(/[\(\[\{].*[\)\}\]]/, '')
        num = temp if temp.match(/\d/)

        num.gsub!("est.", '')

        #check for exponents by searching for 'e' 'E' or variations of 'x 10' '*10' and 'X10^'
        # NOTE(review): '|' inside the character class is matched literally - confirm whether that is intended
        is_exp = false
        expmultiplier = 1
        m = /(\s)*(E|e|[X|x|\*](\s)*10(\^)?)(\s)*/.match(num)
        #check if match is made, preceeded by a number/decimal, and succeeded by a digit or a plus/minus sign
        if m && m.pre_match =~ /[0-9#{@escaped_decimal}]$/ && m.post_match =~ /^([\-+0-9])/
          is_exp = true
          num = m.pre_match
          expmultiplier = 10 ** /^[0-9\-+]*/.match(m.post_match)[0].to_i
        end
        is_million = (num =~ /million/i)
        is_billion = (num =~ /billion/i)
        is_negative = (num =~ /-[\d]/) || (!@settings[:ignore_brackets] && num =~ /\([\d]/)

        # watch out for two numbers, like a range eg "27.3 - 33.9"
        # how: join digits split by one space, then keep only the first run of digits, commas and dots
        num.gsub!(/(\d) (\d)/, '\1\2')
        if m = num.match(/-?\s*[,\d\.]+/)
          num = m[0]
        end

        # only keep #s and decimal mark
        num.gsub!(/[^0-9#{@escaped_decimal}]/, '')
        # whatever the decimal mark was, Float() needs a dot
        num.gsub!(/[^0-9]/, '.')

        return nil if num !~ /[\d]/
        return nil if num.end_with?(".")
        return nil if num.count(".") > 1
        cell = Float("%.#{14}g" % num) # keep 14 significant digits
        cell *= 1e6 if is_million
        cell *= 1e9 if is_billion
        cell *= -1 if is_negative
        cell *= expmultiplier if is_exp
        cell
      end

    end
  end
end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require "quandl/babelfish/version"
|
2
|
+
|
3
|
+
require "quandl/babelfish/data"
|
4
|
+
require "quandl/babelfish/cleaner"
|
5
|
+
require "quandl/babelfish/date_maid"
|
6
|
+
require "quandl/babelfish/number_maid"
|
7
|
+
|
8
|
+
require 'quandl/error/guess_date_format'
|
9
|
+
require 'quandl/error/invalid_date'
|
10
|
+
require 'quandl/error/unknown_date_format'
|
11
|
+
|
12
|
+
module Quandl
  module Babelfish

    # Entry point of the library: hands the raw table and both settings
    # hashes to Cleaner.process and returns whatever it returns.
    #
    # data            - raw table to clean.
    # date_settings   - Hash of date options (defaults to an empty Hash).
    # number_settings - Hash of number options (defaults to an empty Hash).
    def self.clean(data, date_settings={}, number_settings={})
      Cleaner.process(data, date_settings, number_settings)
    end

  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for quandl_babelfish.

# Make lib/ loadable so the version constant can be read below.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.push(lib_dir)
require "quandl/babelfish/version"

Gem::Specification.new do |spec|
  # identity
  spec.name        = "quandl_babelfish"
  spec.version     = Quandl::Babelfish::VERSION
  spec.authors     = ["Sergei Ryshkevich"]
  spec.email       = ["sergei@quandl.com"]
  spec.homepage    = "http://quandl.com/"
  spec.license     = "MIT"
  spec.summary     = "Quandl Data Cleaner"
  spec.description = "Quandl Data Cleaner"

  # packaged files are whatever git tracks
  tracked_files = `git ls-files`.split("\n")
  tracked_tests = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.files         = tracked_files
  spec.test_files    = tracked_tests
  spec.require_paths = ["lib"]

  # development-only dependencies
  spec.add_development_dependency "rspec", "~> 2.13"
  spec.add_development_dependency "pry"
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
include Quandl::Babelfish

# Exercises Cleaner.process end to end: header detection, date conversion
# and number cleaning.
describe Cleaner do

  let(:input) do
    []
  end
  let(:output) do
    Cleaner.process(input)
  end
  subject do
    output
  end

  # large integers in the date column
  context "garbage" do
    let(:input) do
      [
        [2456624, 10],
        [2456625, 20],
        [2456626, 30]
      ]
    end

    it { should be_eq_at_index('[0][0]', Date.new(1970, 1, 29)) }
  end

  # header cells come back stripped
  context "headers with whitespace" do
    let(:input) do
      [
        [" Date ", " C1 ", "C2 ", "   C4"],
        [1990, 1, 2, 3],
        [1991, 4, 5, 6]
      ]
    end

    its(:headers) { should eq(["Date", "C1", "C2", "C4"]) }
  end

  # bare years map to December 31st of that year
  context "annual" do
    let(:input) do
      [
        [1990, 1, 2, 3],
        [1991, 4, 5, 6]
      ]
    end

    it { should be_eq_at_index('[0][0]', Date.new(1990, 12, 31)) }
    it { should be_eq_at_index('[0][1]', 1) }
    it { should be_eq_at_index('[1][0]', Date.new(1991, 12, 31)) }
    it { should be_eq_at_index('[1][3]', 6) }
    its(:headers) { should be_nil }
  end

  # yyyymmdd date with annotated, ranged and non-numeric value cells
  context "numeric date" do
    let(:input) do
      [
        [19900101, '1 [estimate]', '2.3 - 4.0', 'not a number']
      ]
    end

    it { should be_eq_at_index('[0][0]', Date.new(1990, 1, 1)) }
    it { should be_eq_at_index('[0][1]', 1) }
    it { should be_eq_at_index('[0][2]', 2.3) }
    it { should be_eq_at_index('[0][3]', nil) }
    its(:headers) { should be_nil }
  end

  # same data preceded by a header row
  context "data with headers" do
    let(:input) do
      [
        ['Date', 0, 0, 0],
        [19900101, '1 [estimate]', '2.3 - 4.0', 'not a number']
      ]
    end

    it { should be_eq_at_index('[0][0]', Date.new(1990, 1, 1)) }
    it { should be_eq_at_index('[0][1]', 1) }
    it { should be_eq_at_index('[0][2]', 2.3) }
    it { should be_eq_at_index('[0][3]', nil) }
    its(:headers) { should eq(['Date', '0', '0', '0']) }
  end

end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Covers construction of Quandl::Babelfish::Data with and without a
# trailing options Hash.
describe Quandl::Babelfish::Data do
  let(:data_args) do
    []
  end
  subject do
    Quandl::Babelfish::Data.new(*data_args)
  end

  # no arguments at all
  its(:to_a) { should eq([]) }
  its(:headers) { should be_nil }

  context "given Array" do
    let(:data_args) do
      [
        [[1, 2, 3], [4, 3, 5]]
      ]
    end

    its(:to_a) { should eq([[1, 2, 3], [4, 3, 5]]) }
    its(:headers) { should be_nil }
  end

  context "given Array with :headers" do
    let(:data_args) do
      [
        [[1, 2, 3], [4, 3, 5]],
        { headers: ['Date', 'C1', 'C2'] }
      ]
    end

    its(:to_a) { should eq([[1, 2, 3], [4, 3, 5]]) }
    its(:headers) { should eq(['Date', 'C1', 'C2']) }
  end

  # headers that are not an Array are ignored
  context "given junk headers: Float" do
    let(:data_args) do
      [2, { headers: 1.2 }]
    end

    its(:to_a) { should eq([nil, nil]) }
    its(:headers) { should be_nil }
  end

  context "given junk headers: String" do
    let(:data_args) do
      [2, { headers: '1.2' }]
    end

    its(:to_a) { should eq([nil, nil]) }
    its(:headers) { should be_nil }
  end

end
|