incsv 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +6 -0
- data/README.md +3 -3
- data/exe/incsv +4 -0
- data/incsv.gemspec +1 -0
- data/lib/incsv/column_type.rb +24 -0
- data/lib/incsv/database.rb +26 -0
- data/lib/incsv/schema.rb +8 -0
- data/lib/incsv/types/currency.rb +14 -0
- data/lib/incsv/types/date.rb +2 -0
- data/lib/incsv/types/string.rb +9 -0
- data/lib/incsv/version.rb +1 -1
- metadata +18 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a48352c382c9de59e29eb0f294f75f68cc7f8de0
|
4
|
+
data.tar.gz: 749abf80150e9181ef61c0d85746ed685459a754
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e9b77efaf5628776671a0c113f8082335702f33ececab4e62c9f21cd1013d65bbc12f2be5b7e8f769cad9257ec2d5387ea6655b561a1803cc142613bc44e762f
|
7
|
+
data.tar.gz: 909a26477f6065c4a7ff04bc54f38e74c9a6fff4d32e94d73bb15b38f841ace0f120ac483b7b3e89daa9008272a117d64a9c72c4d3452694f455e865c3830aea
|
data/.yardopts
ADDED
data/README.md
CHANGED
@@ -7,8 +7,8 @@ It works by loading the CSV into an [SQLite][] database and then
|
|
7
7
|
dropping you into an interactive Ruby shell. You can then use the
|
8
8
|
[Sequel][] database library to perform further exploratory analysis.
|
9
9
|
|
10
|
-
[
|
11
|
-
[
|
10
|
+
[SQLite]: https://www.sqlite.org/
|
11
|
+
[Sequel]: http://sequel.jeremyevans.net/
|
12
12
|
|
13
13
|
## Installation
|
14
14
|
|
@@ -38,7 +38,7 @@ A quick example:
|
|
38
38
|
{:name=>"enhanced targeting card"},
|
39
39
|
{:name=>"Giddyup Buttercup"}]
|
40
40
|
|
41
|
-
[
|
41
|
+
[REPL]: https://en.wikipedia.org/wiki/Read%E2%80%93eval%E2%80%93print_loop
|
42
42
|
|
43
43
|
### The less-quick version
|
44
44
|
|
data/exe/incsv
CHANGED
@@ -8,6 +8,9 @@ require "pry"
|
|
8
8
|
require "incsv"
|
9
9
|
|
10
10
|
module InCSV
|
11
|
+
# A cut-down class, the binding of which is used for the REPL console.
|
12
|
+
# Any methods and instance variables defined here, therefore, are
|
13
|
+
# accessible from the console.
|
11
14
|
class Console
|
12
15
|
def initialize(db)
|
13
16
|
@db = db
|
@@ -18,6 +21,7 @@ module InCSV
|
|
18
21
|
end
|
19
22
|
end
|
20
23
|
|
24
|
+
# The command-line interface to InCSV.
|
21
25
|
class CLI < Thor
|
22
26
|
desc "create CSV_FILE", "Creates a database file with the appropriate schema for the given CSV file, but doesn't import any data."
|
23
27
|
method_option :force, type: :boolean, default: false
|
data/incsv.gemspec
CHANGED
@@ -22,6 +22,7 @@ Gem::Specification.new do |spec|
|
|
22
22
|
spec.add_development_dependency "bundler", "~> 1.11"
|
23
23
|
spec.add_development_dependency "rake", "~> 10.0"
|
24
24
|
spec.add_development_dependency "rspec", "~> 3.0"
|
25
|
+
spec.add_development_dependency "yard", "~> 0.8"
|
25
26
|
|
26
27
|
spec.add_runtime_dependency "thor", "~> 0.19.1"
|
27
28
|
spec.add_runtime_dependency "pry", "~> 0.10"
|
data/lib/incsv/column_type.rb
CHANGED
@@ -1,9 +1,28 @@
|
|
1
1
|
module InCSV
|
2
|
+
# An abstract class, inherited by all types of column. Specifies the
|
3
|
+
# interface that all these classes must adhere to.
|
2
4
|
class ColumnType
|
5
|
+
# A symbol representation of what type of data this ColumnType
|
6
|
+
# represents. By default this is taken from the class name (so this
|
7
|
+
# class would be :columntype).
|
3
8
|
def self.name
|
4
9
|
self.to_s.sub(/.*::/, "").downcase.to_sym
|
5
10
|
end
|
6
11
|
|
12
|
+
# The type of the column from the perspective of the database. By
|
13
|
+
# default this is the same as the class name, so a column of type
|
14
|
+
# String would go into the database as a :string.
|
15
|
+
#
|
16
|
+
# Possible column types can be found here:
|
17
|
+
#
|
18
|
+
# http://sequel.jeremyevans.net/rdoc/files/doc/schema_modification_rdoc.html#label-Column+types
|
19
|
+
#
|
20
|
+
# This can also be a string, for database-specific features or in
|
21
|
+
# order to specify lengths easily. Examples might be:
|
22
|
+
#
|
23
|
+
# VARCHAR(255)
|
24
|
+
# DECIMAL(10, 2)
|
25
|
+
# BOOLEAN
|
7
26
|
def self.for_database
|
8
27
|
self.to_s.sub(/.*::/, "").downcase.to_sym
|
9
28
|
end
|
@@ -12,14 +31,19 @@ module InCSV
|
|
12
31
|
@value = value
|
13
32
|
end
|
14
33
|
|
34
|
+
# Returns true if the given value (supplied in the constructor)
|
35
|
+
# is of the type represented by this column; returns false
|
36
|
+
# otherwise.
|
15
37
|
def match?
|
16
38
|
false
|
17
39
|
end
|
18
40
|
|
41
|
+
# Returns a cleaned/preprocessed version of the given value.
|
19
42
|
def clean_value
|
20
43
|
self.class.clean_value(@value)
|
21
44
|
end
|
22
45
|
|
46
|
+
# Returns a cleaned/preprocessed version of an arbitrary value.
|
23
47
|
def self.clean_value(value)
|
24
48
|
value
|
25
49
|
end
|
data/lib/incsv/database.rb
CHANGED
@@ -3,6 +3,9 @@ require "sequel"
|
|
3
3
|
require "pathname"
|
4
4
|
|
5
5
|
module InCSV
|
6
|
+
# Represents a database file, handling the creation of the database
|
7
|
+
# and of the table within the database, as well as the importing of
|
8
|
+
# data from a CSV file into the database.
|
6
9
|
class Database
|
7
10
|
def initialize(csv)
|
8
11
|
@csv = csv
|
@@ -14,29 +17,47 @@ module InCSV
|
|
14
17
|
|
15
18
|
attr_reader :db
|
16
19
|
|
20
|
+
# Returns true if the primary database table within the database has
|
21
|
+
# been created.
|
17
22
|
def table_created?
|
18
23
|
@db.table_exists?(table_name)
|
19
24
|
end
|
20
25
|
|
26
|
+
# Returns true if there is data in the primary table. There are
|
27
|
+
# perhaps more accurate ways to calculate this, but only by
|
28
|
+
# comparing samples from the CSV to the table; this is faster and
|
29
|
+
# will in practice be accurate.
|
21
30
|
def imported?
|
22
31
|
table_created? && @db[table_name].count > 0
|
23
32
|
end
|
24
33
|
|
34
|
+
# Returns true if the database file exists; makes no effort to check
|
35
|
+
# whether it is in fact a valid SQLite database.
|
25
36
|
def exists?
|
26
37
|
File.exist?(db_path)
|
27
38
|
end
|
28
39
|
|
40
|
+
# Returns the path to the database file, generated based on the
|
41
|
+
# filename of the CSV passed to the class. For example, a CSV called
|
42
|
+
# `products.csv` will be stored in a database called `products.db`
|
43
|
+
# in the same directory.
|
29
44
|
def db_path
|
30
45
|
path = Pathname(csv)
|
31
46
|
(path.dirname + (path.basename(".csv").to_s + ".db")).to_s
|
32
47
|
end
|
33
48
|
|
49
|
+
# Returns the table name, by default generated based on the filename
|
50
|
+
# of the CSV. For example, a CSV called `products.csv` will produce
|
51
|
+
# a table called `products`.
|
34
52
|
def table_name
|
35
53
|
@table_name ||= begin
|
36
54
|
File.basename(csv, ".csv").downcase.gsub(/[^a-z_]/, "").to_sym
|
37
55
|
end
|
38
56
|
end
|
39
57
|
|
58
|
+
# Creates a table in the database, with one column in the database
|
59
|
+
# for each column in the CSV, the type of which is the best guess
|
60
|
+
# for the data found in that column in the CSV data.
|
40
61
|
def create_table
|
41
62
|
@db.create_table!(table_name) do
|
42
63
|
primary_key :_incsv_id
|
@@ -49,6 +70,11 @@ module InCSV
|
|
49
70
|
end
|
50
71
|
end
|
51
72
|
|
73
|
+
# Imports data from the CSV file into the database, applying any
|
74
|
+
# preprocessing specified by the column type (e.g. stripping
|
75
|
+
# currency prefixes).
|
76
|
+
#
|
77
|
+
# Data is imported in transactions, in chunks of 200 rows at a time.
|
52
78
|
def import
|
53
79
|
return if imported?
|
54
80
|
|
data/lib/incsv/schema.rb
CHANGED
@@ -1,11 +1,15 @@
|
|
1
1
|
require "csv"
|
2
2
|
|
3
3
|
module InCSV
|
4
|
+
# Given a CSV file, samples data from it in order to establish what
|
5
|
+
# data types its columns are.
|
4
6
|
class Schema
|
5
7
|
def initialize(csv)
|
6
8
|
@csv = csv
|
7
9
|
end
|
8
10
|
|
11
|
+
# Returns the column types found in the CSV. Memoises the result, so
|
12
|
+
# can be called repeatedly.
|
9
13
|
def columns
|
10
14
|
@columns ||= parsed_columns
|
11
15
|
end
|
@@ -14,6 +18,10 @@ module InCSV
|
|
14
18
|
|
15
19
|
attr_reader :csv
|
16
20
|
|
21
|
+
# Returns an array with one element for each column in the CSV. The
|
22
|
+
# value is a Column object, which has responsibility for determining
|
23
|
+
# the type of the data stored in the column; a sample of 50 rows
|
24
|
+
# from the column is provided to the Column class for this purpose.
|
17
25
|
def parsed_columns
|
18
26
|
samples(50).map do |name, values|
|
19
27
|
Column.new(name, values)
|
data/lib/incsv/types/currency.rb
CHANGED
@@ -1,16 +1,30 @@
|
|
1
1
|
module InCSV
|
2
2
|
module Types
|
3
|
+
# Represents a currency value, without its symbol/identifier, stored
|
4
|
+
# as a DECIMAL(10, 2) to avoid rounding errors.
|
5
|
+
#
|
6
|
+
# Not storing the identifier is an issue that should be resolved at
|
7
|
+
# some point, ideally; it's obviously an issue in files that have
|
8
|
+
# multiple currencies in the same column.
|
3
9
|
class Currency < ColumnType
|
10
|
+
# A regular expression which matches all supported currency types.
|
4
11
|
MATCH_EXPRESSION = /\A(\$|£)([0-9,\.]+)\z/
|
5
12
|
|
13
|
+
# What type of column to create in the database.
|
6
14
|
def self.for_database
|
7
15
|
"DECIMAL(10,2)"
|
8
16
|
end
|
9
17
|
|
18
|
+
# Returns true if the given value is a supported currency type, or
|
19
|
+
# false otherwise.
|
10
20
|
def match?
|
11
21
|
value.strip.match(MATCH_EXPRESSION)
|
12
22
|
end
|
13
23
|
|
24
|
+
# Strip the currency symbol, and remove any comma separators. This
|
25
|
+
# creates an issue with locales other than English, in which
|
26
|
+
# commas are used for decimal points, but this will work for
|
27
|
+
# English.
|
14
28
|
def self.clean_value(value)
|
15
29
|
return unless value
|
16
30
|
|
data/lib/incsv/types/date.rb
CHANGED
data/lib/incsv/types/string.rb
CHANGED
@@ -1,6 +1,15 @@
|
|
1
1
|
module InCSV
|
2
2
|
module Types
|
3
|
+
# Represents a String, stored in the database as TEXT.
|
4
|
+
# This is the fallback datatype, used for anything that doesn't
|
5
|
+
# match any of the other more specific types. Its matching logic is
|
6
|
+
# therefore simple: it matches anything. For this reason it must be
|
7
|
+
# matched last; this is achieved via require order.
|
3
8
|
class String < ColumnType
|
9
|
+
def self.for_database
|
10
|
+
"TEXT"
|
11
|
+
end
|
12
|
+
|
4
13
|
def match?
|
5
14
|
true
|
6
15
|
end
|
data/lib/incsv/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: incsv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rob Miller
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-03-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '3.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: yard
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0.8'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0.8'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: thor
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -120,6 +134,7 @@ files:
|
|
120
134
|
- ".gitignore"
|
121
135
|
- ".rspec"
|
122
136
|
- ".travis.yml"
|
137
|
+
- ".yardopts"
|
123
138
|
- CODE_OF_CONDUCT.md
|
124
139
|
- Gemfile
|
125
140
|
- LICENSE.txt
|
@@ -164,3 +179,4 @@ signing_key:
|
|
164
179
|
specification_version: 4
|
165
180
|
summary: A tool for interrogating CSV data using SQLite and Sequel.
|
166
181
|
test_files: []
|
182
|
+
has_rdoc:
|