rstore 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,126 @@
1
+ # encoding: utf-8
2
+ require 'rstore/core_ext/object'
3
+
4
+ module RStore
5
class Configuration

  class << self
    # Class-level defaults, readable as Configuration.default_*.
    attr_reader :default_file_options
    attr_reader :default_parse_options
    attr_reader :default_options
  end


  # Supported options
  @default_file_options  = {recursive: false, has_headers: true, selector: ''}
  @default_parse_options = {row_sep: :auto, col_sep: ",", quote_char: '"', field_size_limit: nil, skip_blanks: false}.freeze
  @default_options       = @default_file_options.merge(@default_parse_options)


  # Validations for RStore::CSV specific options.
  # @default_parse_options will not be validated here, as validation occurs on calling CSV.parse.
  # Any option without an explicit entry is accepted (the default block returns
  # a validator that is always true).
  Validations = Hash.new { |_hash, _key| lambda { |_value| true } }.
    merge!({recursive:   lambda { |value| value.boolean_or_nil? },
            has_headers: lambda { |value| value.boolean_or_nil? },
            selector:    lambda { |value| value.is_a?(String) }})


  attr_reader :options
  attr_reader :file_options
  attr_reader :parse_options
  attr_reader :path


  # @param path    the path the options belong to (only stored and used in error messages here)
  # @param options [Hash] option overrides; the hash itself is never modified
  # @raise [ArgumentError] if an option value is invalid or an option key is not supported
  def initialize path, options
    @path        = path
    self.options = options

    # Unknown keys are detected explicitly instead of relying on #options=
    # deleting the recognised keys from the argument.
    unsupported = options.reject { |option, _| Configuration.default_options.key?(option) }
    raise ArgumentError, arg_error_message(@path, unsupported) unless unsupported.empty?

    @file_options  = extract_with(Configuration.default_file_options)
    @parse_options = extract_with(Configuration.default_parse_options)
  end


  # Merges the given options into the current default options and validates
  # every supported option. Keys that are not part of the default options are
  # ignored here. The argument is left untouched.
  #
  # @param options [Hash]
  # @raise [ArgumentError] if a supported option has an invalid value
  def options= options
    defaults = Configuration.default_options

    @options = defaults.merge(options).each_with_object({}) do |(option, value), acc|
      next unless defaults.key?(option)

      unless Configuration.valid_value?(option, value)
        raise ArgumentError, "path #{@path}: '#{value}' (#{value.class}) is not a valid value for option #{option.inspect}"
      end

      acc[option] = value
    end
  end


  # Returns the subset of @options whose keys also occur in the given hash.
  #
  # @param options [Hash] only its keys are used
  # @return [Hash]
  def extract_with options
    @options.select { |option, _| options.key?(option) }
  end


  # Replaces the class-wide default options with the current defaults merged
  # with the given ones.
  #
  # @param options [Hash] keys from the default options with their new values
  # @return [Hash] the new default options
  # @raise [ArgumentError] if options is not a Hash, contains an unknown key
  #   or contains an invalid value
  def self.change_default_options options
    raise ArgumentError, "#{options} must be an instance of Hash" unless options.is_a?(Hash)
    new_options = Configuration.default_options.merge(options)
    # Merging can only grow the hash if a key is not one of the default keys.
    raise ArgumentError, "#{options} contains unknown option key" if new_options.size > Configuration.default_options.size

    new_options.each do |option, value|
      next if valid_value?(option, value)
      raise ArgumentError, "'#{value}' (#{value.class}) is not a valid value for option #{option.inspect}"
    end

    @default_options = new_options
  end


  # Restores the default options to the initial file and parse defaults.
  #
  # @return [Hash]
  def self.reset_default_options
    @default_options = @default_file_options.merge(@default_parse_options)
  end


  # Hash-like read access to an instance variable, e.g. config[:path].
  #
  # @param key [Symbol] instance variable name without the leading '@'
  # @raise [ArgumentError] if no such instance variable is set
  def [] key
    target = instance_variables.find { |var| var.to_s.delete('@').to_sym == key }
    raise ArgumentError, "'#{key}' is not an instance variable" unless target

    instance_variable_get(target)
  end


  # Helper methods
  # ------------------------------------------


  # @return the result of the validation lambda registered for option
  def self.valid_value? option, value
    Validations[option][value]
  end


  # Builds the error message listing the unsupported option keys.
  def arg_error_message path, new_options
    keys = new_options.keys.join(', ')
    "Unsupported options: #{keys} for path '#{path}'"
  end


end
124
+ end
125
+
126
+
@@ -0,0 +1,144 @@
1
+ # encoding: utf-8
2
+ require 'rstore/logger'
3
+ require 'bigdecimal'
4
+
5
+ module RStore
6
class Converter

  # @return [Data] a clone of the data object passed to the constructor
  attr_reader :data
  # @return [Array<Symbol>] Array of symbols representing the Ruby class set for each table column
  attr_reader :column_types
  # @return [Array<Boolean>] Array of boolean values indicating if NULL is allowed as a column value
  attr_reader :allow_null
  # @return [Symbol]
  #  On initialization the only allowed value is :parsed.
  #  Will be set to :converted on successful conversion.
  attr_accessor :state


  # Accepts 'true'/'false' in any letter case as well as '1'/'0'.
  boolean_converter = lambda do |field|
    case field.downcase
    when 'true', '1'  then true
    when 'false', '0' then false
    else raise ArgumentError, "invalid value for Boolean() '#{field}'"
    end
  end

  # Converters used to verify the field data is valid.
  # If a conversion fails, an exception is thrown together
  # with a descriptive error message pointing to the field
  # where the error occurred.
  # Column types without an entry are passed through unchanged.
  Converters = Hash.new { |h, k| h[k] = lambda { |field| field } }.
    merge!({string:     lambda { |field| field },
            date:       lambda { |field| Date.parse(field).to_s },
            datetime:   lambda { |field| DateTime.parse(field).to_s },
            # Convert to DateTime, because DateTime also checks if the argument is valid
            time:       lambda { |field| DateTime.parse(field).to_s },
            integer:    lambda { |field| Integer(field) },
            float:      lambda { |field| Float(field) },
            numeric:    lambda { |field| Float(field) }, # Handle Numeric as Float
            # Check with Float first, then convert, because Float throws an error on invalid values such as 'x'.
            # Kernel#BigDecimal is used because BigDecimal.new no longer exists in current bigdecimal versions.
            bigdecimal: lambda { |field| Float(field); BigDecimal(field) },
            boolean:    lambda { |field| boolean_converter[field] }})


  # @param data_object object responding to #state, #clone, #content, #path and #options
  # @param database    object responding to #schema(table_name)
  # @param table_name  name of the table whose schema drives the conversion
  # @raise [InvalidStateError] unless the data object is in state :parsed
  def initialize data_object, database, table_name
    state = data_object.state
    raise InvalidStateError, "#{state.inspect} is not a valid state for class Converter" unless state == :parsed
    @data         = data_object.clone
    @state        = @data.state
    @schema       = database.schema(table_name)
    @column_types = extract_from_schema :type
    @allow_null   = extract_from_schema :allow_null
    @error        = false
  end


  # Collects one property (e.g. :type or :allow_null) per schema column,
  # skipping primary key columns.
  #
  # @param target [Symbol] key looked up in every column's property hash
  # @return [Array]
  def extract_from_schema target
    # Primary key columns are left out: they are not part of the csv data.
    columns = @schema.reject { |(_, property_hash)| property_hash[:primary_key] == true }

    columns.map { |(_, property_hash)| property_hash[target] }
  end


  # Converts every row of the data content and sets the state to :converted.
  #
  # @return [Data] a new data object holding the converted rows
  def convert
    converted = @data.content.each_with_index.map do |row, row_index|
      convert_row(row, row_index)
    end

    @state = :converted
    Data.new(@data.path, converted, @state, @data.options)
  end



  # Converts the fields of a single row according to the column types.
  # Conversion and row length problems are handed to the Logger.
  #
  # @param row       [Array] field values, nil for empty fields
  # @param row_index [Integer] position of the row, used for error reporting
  # @return [Array] the converted fields
  def convert_row row, row_index
    # CSV.parse adjusts the size of each row to equal the size of the longest row
    # by adding nil where necessary.
    error_message = <<-ERROR.gsub(/^\s+/,'')
      Row length does not match number of columns. Please verify that:
      1. The database table fits the csv table data
      2. There is no primary key on a data column (you always need to
      define a separate column for an auto-incrementing primary key)
    ERROR

    raise InvalidRowLengthError, error_message unless row.size == @column_types.size

    begin
      row.each_with_index.map do |field, field_index|
        # Remembered so the rescue clause can report the failing column.
        @field_index = field_index

        if field.nil?
          validate_null(@allow_null[field_index], field)
        else
          convert_type(@column_types[field_index], field)
        end
      end
    rescue ArgumentError, NullNotAllowedError => e
      logger = Logger.new(@data)
      logger.log(:convert, e, row: row_index, col: @field_index)
      logger.error
    end

  rescue InvalidRowLengthError => e
    logger = Logger.new(@data)
    logger.log(:convert, e, row: row_index)
    logger.error
  end


  # Helper methods ---------------------------------



  # Runs the converter registered for column_type on field.
  def convert_type column_type, field
    Converters[column_type][field]
  end


  # @return the field unchanged if NULL is allowed for the column
  # @raise [NullNotAllowedError] otherwise
  def validate_null allow_null, field
    raise NullNotAllowedError, "NULL value (empty field) not allowed" unless allow_null == true
    field
  end

end
143
+ end
144
+
@@ -0,0 +1,7 @@
1
+ # encoding: utf-8
2
+
3
+ require 'csv'
4
+
5
+ # Wrapper around CSV to avoid name clashes inside RStore::CSV
6
# Plain subclass of the standard library CSV class; adds no behaviour of its own.
class CSVWrapper < CSV; end
@@ -0,0 +1,13 @@
1
+ # encoding: utf-8
2
+
3
class Hash

  # Checks whether every key/value pair of the given hash is also present
  # in the receiver.
  #
  # A key has to actually exist in the receiver: a missing key does not
  # count as a match for an expected value of nil.
  #
  # @param hash [Hash] the pairs to look for
  # @return [Boolean] false if hash is empty, otherwise true only if all pairs are included
  def include_pairs? hash
    return false if hash.empty?

    hash.all? { |key, val| key?(key) && self[key] == val }
  end

end
@@ -0,0 +1,10 @@
1
+ # encoding: utf-8
2
+
3
class Object

  # Tells whether the receiver is exactly true, false or nil.
  #
  # @return [Boolean]
  def boolean_or_nil?
    nil? || equal?(true) || equal?(false)
  end

end
@@ -0,0 +1,42 @@
1
+ # encoding: utf-8
2
class String

  # Removes a double quote at the start and at the end of every line
  # (^ and $ are line anchors) and replaces each doubled quote ("") by a
  # single one.
  #
  # @return [String] a new string
  def unquote
    gsub(/(^"|"$)/, "").gsub(/""/, '"')
  end

  # Checks if the whole String represents an Integer: an optional sign
  # followed by digits, where commas are accepted between the digits.
  #
  # @return [Boolean]
  def is_i?
    # \A and \z anchor the whole string; ^ and $ would accept any matching line.
    !!(self =~ /\A[-+]?[0-9,]+\z/)
  end

  # Checks if the whole String represents a Float: an optional sign, digits
  # (commas accepted), a dot and at least one more digit.
  #
  # @return [Boolean]
  def is_f?
    !!(self =~ /\A[-+]?[0-9,]+\.[0-9]+\z/)
  end

  # Converts the String into a number.
  #
  # @return [Float, Integer, nil] nil if the String is neither a Float nor an Integer
  def to_num
    # Commas are removed first: to_f/to_i stop reading at the first comma,
    # which would turn "1,000" into 1.
    if is_f?
      delete(',').to_f
    elsif is_i?
      delete(',').to_i
    end
  end


  # Checks if the String starts with something that looks like a URL
  # (with http/https scheme, starting with www, or domain followed by a slash).
  #
  # @return [Boolean]
  def url?
    # http://daringfireball.net/2010/07/improved_regex_for_matching_urls
    # Anchored with \A so that only the beginning of the whole string counts.
    url_regex = /\A((?:https?:\/\/|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+
                (?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))/x

    !!(self =~ url_regex)
  end

end
data/lib/rstore/csv.rb ADDED
@@ -0,0 +1,288 @@
1
+ # encoding: utf-8
2
+ require 'open-uri'
3
+ require 'rstore/data'
4
+ require 'rstore/file_crawler'
5
+ require 'rstore/converter'
6
+ require 'rstore/configuration'
7
+ require 'rstore/base_db'
8
+ require 'rstore/base_table'
9
+ require 'rstore/core_ext/string'
10
+
11
+
12
+ module RStore
13
class CSV

  #@return [BaseDB] a subclass of {RStore::BaseDB}
  attr_reader :database
  #@return [BaseTable] a subclass of {RStore::BaseTable}
  attr_reader :table
  #@return [Array<Data>] holds `RStore::Data` objects that are used internally to store information from a data source.
  attr_reader :data_array


  # This constructor takes a block yielding an implicit instance of _self_.
  # Within the block, the following methods need to be called:
  #
  # * {#from}
  # * {#to}
  # * {#run}
  # @example
  #  RStore::CSV.new do
  #    from '../easter/children', :recursive => true                   # select a directory or
  #    from '../christmas/children/toys.csv'                           # file, or
  #    from 'www.example.com/sweets.csv', :selector => 'pre div.line'  # URL
  #    to   'company.products'                                         # provide database and table name
  #    run                                                             # run the program
  #  end
  def initialize &block
    @data_hash  = {}
    @data_array = []
    @database   = nil
    @table      = nil

    # Tracking method calls to #from, #to, and #run.
    @from = false
    @to   = false
    @run  = false

    instance_eval(&block) if block_given?
  end


  # Specify the source of the csv file(s)
  # There can be several calls to this method on given instance of `RStore::CSV`.
  # This method has to be called before {#run}.
  # @overload from(source, options)
  #  @param [String] source The relative or full path to a directory, file, or an URL
  #  @param [Hash] options The options used to customize fetching and parsing of csv data
  #  @option options [Boolean] :has_headers When set to false, the first line of a file is processed as data, otherwise it is discarded.
  #   (default: `true`)
  #  @option options [Boolean] :recursive When set to true and a directory is given, recursively search for files. Non-csv files are skipped.
  #   (default: `false`)
  #  @option options [String] :selector Mandatory css selector with an URL. Uses the same syntax as Nokogiri, default: `""`
  #  @option options [String] :col_sep The String placed between each field. (default: `","`)
  #  @option options [String, Symbol] :row_sep The String appended to the end of each row.
  #   (default: `:auto`)
  #  @option options [String] :quote_char The character used to quote fields.
  #   (default: `'"'`)
  #  @option options [Integer, Nil] :field_size_limit The maximum size CSV will read ahead looking for the closing quote for a field.
  #   (default: `nil`)
  #  @option options [Boolean] :skip_blanks When set to a true value, CSV will skip over any rows with no content.
  #   (default: `false`)
  # @overload from(source)
  #  @param [String] source The relative or full path to a directory, file, or an URL. The default options will be used.
  # @return [void]
  # @example
  #  store = RStore::CSV.new
  #  # fetching data from a file
  #  store.from '../christmas/children/toys.csv'
  #  # fetching data from a directory
  #  store.from '../easter/children', :recursive => true
  #  # fetching data from an URL
  #  store.from 'www.example.com/sweets.csv', :selector => 'pre div.line'
  def from source, options={}
    crawler = FileCrawler.new(source, :csv, options)
    @data_hash.merge!(crawler.data_hash)
    @from = true
  end


  # Choose the database table to store the csv data into.
  # This method has to be called before {#run}.
  # @param [String] db_table The names of the database and table, separated by a dot, e.g. 'database.table'.
  #  The name of the database has to correspond to a subclass of `RStore::BaseDB`:
  #    CompanyDB < RStore::BaseDB -> 'company'
  #  The name of the table has to correspond to a subclass of `RStore::BaseTable`:
  #    DataTable < RStore::BaseTable -> 'data'
  # @return [void]
  # @example
  #  store = RStore::CSV.new
  #  store.to('company.products')
  def to db_table
    @database, @table = self.class.database_table(db_table)
    @to = true
  end


  # Looks up the database and table classes for a 'database.table' string.
  # Both names are downcased and used as symbol keys into
  # `BaseDB.db_classes` and `BaseTable.table_classes`.
  #
  # @raise [ArgumentError] if the string is not of the form 'database.table'
  #   or if the database or table is not registered
  # @return [Array] the pair [database, table]
  #@private
  def self.database_table db_table
    raise ArgumentError, "The name of the database and table have to be separated with a dot (.)" unless delimiter_correct?(db_table)

    db, tb = db_table.split('.')

    database = BaseDB.db_classes[db.downcase.to_sym]
    table    = BaseTable.table_classes[tb.downcase.to_sym]

    # ArgumentError (a StandardError) instead of a bare Exception, so a plain
    # `rescue` is able to handle a wrong name.
    raise ArgumentError, "Database '#{db}' not found" if database.nil?
    raise ArgumentError, "Table '#{tb}' not found"    if table.nil?

    [database, table]
  end


  # Start processing the csv files, storing the data into a database table.
  # Both methods, {#from} and {#to}, have to be called before this method.
  # @raise [RuntimeError] if {#from} or {#to} has not been called before
  # @return [void]
  def run
    return if ran_once?   # Ignore subsequent calls to #run
    raise "At least one method 'from' has to be called before method 'run'" unless @from
    raise "Method 'to' has to be called before method 'run'"                unless @to

    # Rebuilt on every attempt: a retry after a failed run must not
    # contain the data of the previous attempt a second time.
    fetched = @data_hash.map do |path, data|
      Data.new(path, read_data(data), :raw, data.options)
    end
    @data_array.replace(fetched)

    @database.connect do |db|

      create_table(db)
      name = @table.name

      prepared_data_array = @data_array.map do |data|
        data.parse_csv.convert_fields(db, name)
      end

      insert_all(prepared_data_array, db, name)

      @run = true
      message = <<-TEXT.gsub(/^\s+/, '')
      ===============================
      All data has been successfully inserted into table '#{database.name}.#{table.name}'
      -------------------------------
      You can retrieve all table data with the following code:
      -------------------------------
      #{self.class}.query('#{database.name}.#{table.name}') do |table|
        table.all
      end
      ===============================
      TEXT
      puts message
    end
  end


  # Inserts every data object inside of one outer transaction.
  #@private
  def insert_all data_stream, database, name
    database.transaction do  # outer transaction
      data_stream.each do |data|
        data.into_db(database, name)
      end
    end
  end

  private :insert_all


  # Reads the raw content for a data object, either from an URL or from a file.
  # Any failure is passed on to the Logger.
  #
  # @param data_object object responding to #path and #options
  # @return [String] the content that has been read
  #@private
  def read_data data_object
    path    = data_object.path
    options = data_object.options

    begin
      content =
        if path.url?
          fetch_url_content(path, options[:file_options][:selector])
        else
          File.read(path)
        end

      raise ArgumentError, "Empty content!" if content.empty?

    # LoadError covers a missing nokogiri; signals and exit requests are no
    # longer swallowed as they were with `rescue Exception`.
    rescue StandardError, LoadError => e
      logger = Logger.new(data_object)
      logger.log(:fetch, e)
      logger.error
    end

    content
  end


  # Downloads a page and joins the content of all nodes matching the css
  # selector, one node per line.
  #
  # @param url      [String] URL, with or without an http(s) scheme
  # @param selector [String] css selector
  # @return [String]
  #@private
  def fetch_url_content url, selector
    require 'nokogiri'   # loaded lazily, only needed for URLs

    # URLs such as 'www.example.com/sweets.csv' carry no scheme;
    # open-uri only handles URIs with one.
    url = "http://#{url}" unless url =~ /\Ahttps?:\/\//

    # URI#open (open-uri) instead of Kernel#open, which does not open URLs
    # on current Ruby versions. The block form closes the stream.
    doc = URI.parse(url).open { |io| Nokogiri::HTML(io) }

    doc.css(selector).map { |node| "#{node.content}\n" }.join
  end

  private :fetch_url_content


  # Creates the table unless it already exists.
  # For the 'mysql' adapter, the Sequel MySQL defaults are adjusted first.
  #@private
  def create_table db

    name = @table.name
    info = @database.connection_info

    if info.is_a?(Hash) && info[:adapter] == 'mysql'
      # http://sequel.rubyforge.org/rdoc/files/doc/release_notes/2_10_0_txt.html
      Sequel::MySQL.default_engine = 'InnoDB'
      # http://stackoverflow.com/questions/1671401/unable-to-output-mysql-tables-which-involve-dates-in-sequel
      Sequel::MySQL.convert_invalid_date_time = nil
    end

    db.create_table(name, &@table.table_info) unless db.table_exists?(name)

  end


  # Easy querying by yielding a {http://sequel.rubyforge.org/rdoc/files/doc/dataset_basics_rdoc.html Sequel::Dataset} instance of your table.
  # @param [String] db_table The name of the database and table, separated by a dot.
  # @return [void]
  # @yieldparam [Sequel::Dataset] table The dataset of your table
  # @example
  #  RStore::CSV.query('company.products') do |table|    # table = Sequel::Dataset object
  #    table.all                                         # fetch everything
  #    table.all[3]                                      # fetch row number 4
  #    table.filter(:id => 2).update(:on_stock => true)  # update entry
  #    table.filter(:id => 3).delete                     # delete entry
  #  end
  def self.query db_table, &block
    database, table = database_table(db_table)
    database.connect do |db|
      block.call(db[table.name]) if block_given?  # Sequel::Dataset
    end
  end



  # Checks that the name consists of exactly two non-empty parts
  # separated by a single dot.
  #@private
  def self.delimiter_correct? name
    # \A and \z anchor the whole string (^ and $ only anchor single lines).
    !!(name =~ /\A[^\.]+\.[^\.]+\z/)
  end

  # Test if the data has been inserted into the database table.
  # @return [Boolean]
  def ran_once?
    @run == true
  end


  # Change default options recognized by {#from}
  # The new option values apply to all following instances of `RStore::CSV`
  # Options can be reset to their defaults by calling {.reset_default_options}
  # See {#from} for a list of all options and their default values.
  # @param [Hash] options Keys from default options with their respective new values.
  # @return [void]
  # @example
  #  # Search directories recursively and handle the first row of a file as data by default
  #  RStore::CSV.change_default_options(:recursive => true, :has_headers => false)
  def self.change_default_options options
    Configuration.change_default_options(options)
  end



  # Reset the options recognized by {#from} to their default values.
  # @return [void]
  # @example
  #  RStore::CSV.reset_default_options
  def self.reset_default_options
    Configuration.reset_default_options
  end


end
286
+ end
287
+
288
+