rstore 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +20 -0
- data/README.md +253 -0
- data/Rakefile +40 -0
- data/lib/rstore.rb +5 -0
- data/lib/rstore/base_db.rb +119 -0
- data/lib/rstore/base_table.rb +92 -0
- data/lib/rstore/configuration.rb +126 -0
- data/lib/rstore/converter.rb +144 -0
- data/lib/rstore/core_ext/csv_wrapper.rb +7 -0
- data/lib/rstore/core_ext/hash.rb +13 -0
- data/lib/rstore/core_ext/object.rb +10 -0
- data/lib/rstore/core_ext/string.rb +42 -0
- data/lib/rstore/csv.rb +288 -0
- data/lib/rstore/data.rb +80 -0
- data/lib/rstore/exceptions.rb +11 -0
- data/lib/rstore/file_crawler.rb +135 -0
- data/lib/rstore/logger.rb +104 -0
- data/lib/rstore/modules/helper_methods.rb +14 -0
- data/lib/rstore/storage.rb +71 -0
- data/lib/rstore/version.rb +3 -0
- metadata +103 -0
data/lib/rstore/data.rb
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'csv'
|
4
|
+
require 'rstore/converter'
|
5
|
+
require 'rstore/storage'
|
6
|
+
require 'rstore/core_ext/object'
|
7
|
+
require 'rstore/core_ext/csv_wrapper'
|
8
|
+
|
9
|
+
module RStore
  # Container for the content of one data source (identified by +path+),
  # together with its processing state and the options that apply to it.
  #
  # Instances move through the states listed in KnownStates: #parse_csv
  # requires a :raw object and returns a new :parsed one; Storage requires
  # a :converted object.
  class Data

    # Path (or URL) identifying the data source.
    attr_reader :path
    # Current content: whatever was passed to the constructor.
    attr_reader :content
    # Current processing state, one of KnownStates.
    attr_reader :state
    # Options hash; #parse_csv reads the keys :file_options and :parse_options.
    attr_reader :options


    # All states accepted by #state=.
    KnownStates = [:raw, :parsed, :converted, :error].freeze


    # @param path    identifier of the data source, used in error messages
    # @param content the content belonging to +path+
    # @param state   [Symbol] one of KnownStates
    # @param options [Hash]   options for this data source
    # @raise [ArgumentError] if +options+ is not a Hash or +state+ is unknown
    def initialize(path, content, state, options)
      unless options.is_a?(Hash)
        raise ArgumentError,
              "#{path}: The following options are not valid as an argument to #{self.class}:\n#{options}"
      end

      @path    = path
      @content = content
      self.state = state   # validated by the setter below
      @options = options
    end


    #def extract_type path
    #  path, filename = File.split(path)
    #  filename.match(/\.(?<type>.*)$/)[:type].to_sym
    #end

    # Parses the raw content with CSVWrapper and returns a new Data object in
    # state :parsed. The first row is dropped when
    # options[:file_options][:has_headers] is true.
    #
    # The receiver's own state is set to :parsed as well (its content is left
    # untouched), so it cannot be parsed a second time.
    #
    # @return [Data] new object holding the parsed rows
    # @raise [InvalidStateError] unless the current state is :raw
    # @raise whatever Logger#error raises when parsing fails
    def parse_csv
      raise InvalidStateError, "#{state.inspect} is not a valid Data state for method 'parse_csv'" unless state == :raw

      file_options  = @options[:file_options]
      parse_options = @options[:parse_options]

      begin
        csv = CSVWrapper.parse(@content, parse_options)
        csv = csv.drop(1) if file_options[:has_headers] == true  # drop the first row if it is a header
      rescue => e
        # Report through the Logger API the same way Storage#insert does.
        # The previous code called a non-existent Logger#print on an unset
        # @data, turning every parse failure into a NoMethodError.
        logger = Logger.new(self)
        logger.log(:parse, e)
        logger.error
      end

      @state = :parsed
      Data.new(@path, csv, @state, @options)
    end


    # Hands this object to a Converter for the given database table and
    # returns the result of Converter#convert.
    def convert_fields(database, table_name)
      Converter.new(self, database, table_name).convert
    end


    # Hands this object to a Storage for the given database table and
    # returns the result of Storage#insert.
    def into_db(database, table_name)
      Storage.new(self, database, table_name).insert
    end


    # Sets the processing state.
    #
    # @param state [Symbol] one of KnownStates
    # @raise [ArgumentError] if +state+ is not listed in KnownStates
    def state=(state)
      unless KnownStates.include?(state)
        raise ArgumentError,
              "#{state.inspect} is not a valid state. The following states are valid: #{print_valid_states}"
      end

      @state = state
    end

    # Helper methods --------------------------------

    # @return [String] the known states, inspected and comma separated
    def print_valid_states
      KnownStates.map { |s| s.inspect }.join(', ')
    end

  end
end
|
80
|
+
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module RStore
  # Custom error classes of the RStore library. All of them derive from
  # StandardError, so a plain +rescue+ catches them.

  # The error thrown when the length of a row does not fit the number of
  # columns in the db table.
  class InvalidRowLengthError < StandardError
  end

  # NOTE(review): presumably raised when an empty value is found for a column
  # that does not allow NULL; confirm against RStore::Converter.
  class NullNotAllowedError < StandardError
  end

  # Raised when an object is not in the state an operation requires.
  class InvalidStateError < StandardError
  end

  # Raised by RStore::Logger#error with the report of a failed processing step.
  class FileProcessingError < StandardError
  end

end
|
11
|
+
|
@@ -0,0 +1,135 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'open-uri'
|
4
|
+
require 'rstore/configuration'
|
5
|
+
require 'rstore/data'
|
6
|
+
require 'rstore/core_ext/string'
|
7
|
+
|
8
|
+
module RStore
  # Resolves a file, directory or URL into a list of paths and builds one
  # Data object (state :raw, empty content) per path.
  #
  # Any error raised while resolving the paths is reported through
  # Logger#log(:fetch, ...) followed by Logger#error.
  class FileCrawler

    #attr_reader :file_options_hash
    # Hash of path => Data object.
    attr_reader :data_hash

    # Options taken from the Configuration object.
    attr_reader :file_options, :parse_options
    # The file, directory or URL passed to the constructor.
    attr_reader :path
    # Resolved paths and the file type (extension without the dot) to look for.
    attr_reader :file_paths, :file_type
    # The Configuration built from the constructor arguments.
    attr_reader :config



    # @param file_or_folder [String] path to a file or directory, or a URL
    # @param file_type      extension (without dot) of the files to load
    # @param options        [Hash] passed on to Configuration.new
    def initialize(file_or_folder, file_type, options={})
      @path          = file_or_folder
      @file_type     = file_type
      @config        = Configuration.new(file_or_folder, options)
      @file_options  = @config.file_options
      @parse_options = @config.parse_options
      # Each step consumes the result of the previous one.
      self.file_paths        = @path
      self.file_options_hash = @file_paths
      self.data_hash         = @file_options_hash
    end


    # Resolves +path+ into the list of file paths to process:
    # * URL       - the verified (possibly corrected) URL
    # * directory - every file of @file_type in it, recursively if
    #               file_options[:recursive] is set
    # * file      - the expanded path, provided it has the right extension
    #
    # Does nothing if the paths have already been set.
    def file_paths=(path)
      return @file_paths unless @file_paths.nil?

      @file_paths = []
      @file_paths = path.url? ? [verify_and_format_url(path)] : local_file_paths(path)
    rescue StandardError => e
      # Dirty hack to be able to instantiate Logger, which needs a Data object.
      data = Data.new(path, '', :raw, Configuration.default_options)

      logger = Logger.new(data)
      logger.log(:fetch, e)
      logger.error
    end


    # Builds the path => Data mapping from a path => options mapping.
    def data_hash=(options_hash)
      pairs = options_hash.map do |path, options|
        [path, Data.new(path, '', :raw, options)]
      end
      @data_hash = Hash[pairs]
    end


    # Builds a mapping path => {file_options: ..., parse_options: ...} for
    # every path. Does nothing if the mapping has already been set.
    def file_options_hash=(file_paths)
      return @file_options_hash unless @file_options_hash.nil?

      # Both levels keep their auto-vivifying default blocks.
      hash = Hash.new { |outer, path| outer[path] = Hash.new { |inner, key| inner[key] = nil } }
      file_paths.each do |path|
        hash[path][:file_options]  = @file_options
        hash[path][:parse_options] = @parse_options
      end
      @file_options_hash = hash
    end


    # Lists the files of @file_type below the current working directory.
    #
    # @param option truthy to descend into sub directories
    # @return [Array<String>] relative paths; directories whose name happens
    #   to match the pattern are left out
    def parse_directory(option)
      pattern = option ? "**/*.{#{@file_type}}" : "*.{#{@file_type}}"
      Dir.glob(pattern).reject { |file| File.directory?(file) }
    end


    # Helper methods ---------------------------


    # @return [Boolean] true if +path+ ends with ".<file_type>"
    def can_read?(path)
      path.to_s.end_with?(".#{@file_type}")
    end


    # Checks that +url+ can be opened and returns the address that worked.
    # Two corrections are tried on failure: prefixing 'http://' to addresses
    # starting with 'www', and switching from http to https.
    #
    # @raise [ArgumentError] if the URL cannot be opened in any form
    def verify_and_format_url(url)
      address = url
      begin # additional 'begin' block so that the original, unchanged url can be used in the error message
        probe_url(address)
        address
      rescue
        case address
        when /\Awww/     # open-uri does not recognize URLs starting with 'www'
          address = 'http://' + address
          retry
        when /\Ahttp:/   # open-uri does not redirect from http to https on a valid https URL
          # Replace the scheme only, not every "http" inside the URL.
          address = address.sub(/\Ahttp:/, 'https:')
          retry
        else
          raise ArgumentError, "Could not connect to #{url}. Please check if this URL is correct."
        end
      end
    end


    private

    # Returns the paths for a local file or directory (see #file_paths=).
    def local_file_paths(path)
      expanded = File.expand_path(path)

      if File.directory?(expanded)
        # Work inside the directory so that the glob patterns are relative to it.
        Dir.chdir(expanded) do
          parse_directory(@file_options[:recursive]).map { |file| File.expand_path(file) }
        end
      else # Either a file or a non-existing path
        raise ArgumentError, "'#{path}' is not a valid path" unless File.exist?(expanded)
        raise ArgumentError, unsupported_type_message unless can_read?(path)

        [expanded]
      end
    end

    # Error text for a single file with the wrong extension.
    def unsupported_type_message
      <<-MESSAGE.gsub(/^\s+/,'')
        Not a #{@file_type} file.
        NOTE: Non-#{@file_type} files in a directory path
        are silently skipped WITHOUT raising an exception
      MESSAGE
    end

    # Opens +address+ once to check that it is reachable and closes the handle
    # again. URI.open is used where open-uri provides it; Kernel#open no
    # longer handles URLs on Ruby 3.0 and later.
    def probe_url(address)
      handle = URI.respond_to?(:open) ? URI.open(address) : open(address)
      handle.close if handle.respond_to?(:close)
    end

  end
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'rstore/exceptions'
|
4
|
+
|
5
|
+
module RStore
  # Builds a human readable error report for a failed processing step of a
  # Data object and raises it as a FileProcessingError.
  #
  # Typical use:
  #   logger = Logger.new(data)
  #   logger.log(:store, exception, row: row_index)
  #   logger.error
  class Logger

    # The object the report is about; must respond to #path and #options.
    attr_accessor :data
    # The last report built by #log ('' until #log has been called).
    attr_accessor :message


    # Processing steps that can be logged, with the text used in the report.
    KnownStates =
      {:fetch   => "loading files",
       :parse   => "parsing file content",
       :convert => "converting field values into their corresponding datatypes",
       :store   => "storing file content into database"}



    # @param data_object object responding to #path and #options
    def initialize(data_object)
      @data    = data_object
      @message = ''
    end


    # Builds the error report and stores it in #message.
    #
    # @param state [Symbol] processing step, one of the keys of KnownStates
    # @param error the exception that occurred
    # @param loc   [Hash] optional zero-based position, keys :row and :col
    # @return [String] the report
    # @raise [ArgumentError] if +state+ is not a key of KnownStates
    def log(state, error, loc={})
      # The comma after ArgumentError is required; without it Ruby looks for a
      # method named ArgumentError and raises NoMethodError.
      raise ArgumentError, "#{state} is an invalid state for #{self.class}" unless valid_state?(state)

      loc = correct_location(loc)

      type_of_error = error.class
      error_message = error.to_s
      location      = loc.empty? ? '' : "Location : #{location_to_s(loc)}"

      # gsub strips the leading whitespace of every line; an empty location
      # line disappears with it.
      report = <<-TEXT.gsub(/^\s+/, '')
        An error occured while #{KnownStates[state]}:
        File : #{@data.path}
        Type of error: #{type_of_error}
        Error message: #{error_message}
        #{location}
        =============
        Please fix the error and run again.
        NOTE: No data has been inserted into the database yet.
        =============
      TEXT

      @message = report
    end


    # Raises the stored report.
    #
    # @raise [FileProcessingError] always
    def error
      raise FileProcessingError, @message
    end


    # Helper methods ------------------------

    # @return [String] e.g. "row 2, col 1"
    def location_to_s(location)
      location.map { |loc, val| "#{loc} #{val}" }.join(', ')
    end



    # Converts zero-based indices into the one-based numbers shown to the
    # user. A location without :row is returned unchanged.
    def correct_location(location)
      return location unless location[:row]   # row_index

      corrected = {row: correct_row(location[:row])}
      corrected[:col] = location[:col] + 1 if location[:col]   # col_index
      corrected
    end


    # row = row_index, which starts at 0
    # Without headers: add 1 to row
    # With headers   : add another 1 to row as the header row had been already removed
    def correct_row(row)
      with_headers? ? row + 2 : row + 1
    end


    # @return [Boolean] true if +state+ is a key of KnownStates
    def valid_state?(state)
      KnownStates.key?(state)
    end


    # Whether the source file has a header row. Data objects keep this flag
    # under options[:file_options] (see Data#parse_csv); the flat
    # options[:has_headers] is still honoured as a fallback.
    def with_headers?
      options      = @data.options
      file_options = options[:file_options]

      nested = file_options.is_a?(Hash) && file_options[:has_headers]
      nested || options[:has_headers]
    end

  end
end
|
101
|
+
|
102
|
+
|
103
|
+
|
104
|
+
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module RStore
  # Helpers shared between classes that work with a table schema.
  module HelperMethods

    # Calculate primary key from schema.
    #
    # @param schema enumerable of [column_name, column_properties] pairs
    # @return the name of the first column whose properties have
    #   :primary_key set to exactly +true+, or nil if there is none
    def p_key(schema)
      candidates = []
      schema.each do |col_name, col_properties|
        candidates << col_name if col_properties[:primary_key] == true
      end
      candidates.compact.first
    end

  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'sequel'
|
4
|
+
require 'rstore/data'
|
5
|
+
require 'rstore/logger'
|
6
|
+
require 'rstore/exceptions'
|
7
|
+
require 'rstore/modules/helper_methods'
|
8
|
+
|
9
|
+
module RStore
  # Inserts the rows of a converted Data object into a database table,
  # inside a single transaction.
  class Storage
    include HelperMethods

    attr_accessor :data, :db, :table, :prepared_data, :primary_key
    attr_accessor :state


    # @param data_object object in state :converted; its #content is read as
    #   a list of rows
    # @param database    database object responding to #schema, #[] and
    #   #transaction
    # @param table_name  name of the target table
    # @raise [InvalidStateError] unless +data_object+ is in state :converted
    def initialize(data_object, database, table_name)
      state = data_object.state
      raise InvalidStateError, "#{state.inspect} is not a valid state on initialization for class Storage" unless state == :converted

      @state         = state
      @data          = data_object.clone
      @db            = database
      @table         = table_name
      @schema        = @db.schema(@table)
      @primary_key   = p_key @schema
      @prepared_data = prepare_data
    end


    # @return [Array] the column names of the table schema without the
    #   primary key (and without nil names)
    def column_names
      names = @schema.map { |(col_name, _col_properties)| col_name }
      names.reject { |name| name.nil? || name == @primary_key }
    end


    # Pairs every row of the data content with the column names.
    #
    # @return [Array<Hash>] one column_name => value hash per row
    def prepare_data
      col_names = column_names
      @data.content.map { |row| Hash[col_names.zip(row)] }
    end


    # Inserts all prepared rows within one database transaction.
    #
    # On failure the error is reported through Logger#log(:store, ...) with
    # the index of the row being inserted, followed by Logger#error. Only
    # StandardError is handled, so signals and exit requests pass through
    # untouched.
    #
    # @return [Symbol] :stored
    def insert
      dataset = @db[@table]
      begin
        @db.transaction do
          @prepared_data.each_with_index do |row, row_index|
            @row_index = row_index   # remembered for the error report
            dataset.insert(row)
            # Sequel often only throws an exception when retrieving an incorrect record,
            # The following therefore is to catch invalid data of data types that are
            # not checked by RStore::Converter
            dataset.order(@primary_key).last
          end
        end
      rescue StandardError => e
        logger = Logger.new(@data)
        logger.log(:store, e, row: @row_index)
        logger.error
      end

      @state = :stored
      @state
    end
  end
end
|
69
|
+
|
70
|
+
|
71
|
+
|
metadata
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rstore
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Stefan Rohlfing
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-10-27 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: &17766760 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *17766760
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: rspec
|
27
|
+
requirement: &17766300 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *17766300
|
36
|
+
description: ! '+ Batch processing of csv files
|
37
|
+
|
38
|
+
+ Fetches data from different sources: files, directories, URLs
|
39
|
+
|
40
|
+
+ Customizable using additional options
|
41
|
+
|
42
|
+
+ Validation of field values. At the moment validation of the following types is
|
43
|
+
supported
|
44
|
+
|
45
|
+
+ Descriptive error messages pointing helping you to find any invalid data quickly
|
46
|
+
|
47
|
+
+ Safe and transparent data storage:
|
48
|
+
|
49
|
+
+ -- Using database transactions: Either the data from all files is stored or none
|
50
|
+
|
51
|
+
+ -- The data storage method can only be executed once for every instance of RStore::CSV
|
52
|
+
|
53
|
+
'
|
54
|
+
email: stefan.rohlfing@gmail.com
|
55
|
+
executables: []
|
56
|
+
extensions: []
|
57
|
+
extra_rdoc_files: []
|
58
|
+
files:
|
59
|
+
- lib/rstore/version.rb
|
60
|
+
- lib/rstore/csv.rb
|
61
|
+
- lib/rstore/configuration.rb
|
62
|
+
- lib/rstore/converter.rb
|
63
|
+
- lib/rstore/base_db.rb
|
64
|
+
- lib/rstore/data.rb
|
65
|
+
- lib/rstore/file_crawler.rb
|
66
|
+
- lib/rstore/logger.rb
|
67
|
+
- lib/rstore/storage.rb
|
68
|
+
- lib/rstore/base_table.rb
|
69
|
+
- lib/rstore/core_ext/hash.rb
|
70
|
+
- lib/rstore/core_ext/csv_wrapper.rb
|
71
|
+
- lib/rstore/core_ext/string.rb
|
72
|
+
- lib/rstore/core_ext/object.rb
|
73
|
+
- lib/rstore/exceptions.rb
|
74
|
+
- lib/rstore/modules/helper_methods.rb
|
75
|
+
- lib/rstore.rb
|
76
|
+
- README.md
|
77
|
+
- Rakefile
|
78
|
+
- LICENSE
|
79
|
+
homepage: http://github.com/bytesource/rstore
|
80
|
+
licenses: []
|
81
|
+
post_install_message:
|
82
|
+
rdoc_options: []
|
83
|
+
require_paths:
|
84
|
+
- lib
|
85
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
86
|
+
none: false
|
87
|
+
requirements:
|
88
|
+
- - ! '>='
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
version: 1.9.1
|
91
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
92
|
+
none: false
|
93
|
+
requirements:
|
94
|
+
- - ! '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
requirements: []
|
98
|
+
rubyforge_project: rstore
|
99
|
+
rubygems_version: 1.8.10
|
100
|
+
signing_key:
|
101
|
+
specification_version: 3
|
102
|
+
summary: RStore - A library for easy batch storage of csv data into a database
|
103
|
+
test_files: []
|