csv-import-analyzer 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 40f1ef2bfdbd829eaa64dfa88f360ee722d60228
|
4
|
+
data.tar.gz: c0beeb4085de093f7d41b79e3688cfe36a2b6811
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 94a4839a40f22301b36776b6155855f5bc49a162f3f6c3d9706c46b1105ecf4598c65969d76bc4fade7f1a2d9250fc32fe4fb3ce121657f18fd661bf71cf9ca5
|
7
|
+
data.tar.gz: e359dd8f516b2a96d3f799477975c2006639229ee40a9a9cccf18cca8893e16217c6f7ef1349df4ddd448dbbb7d2964c57854a5d7094808ab81b14fdebe8a4cf
|
data/README.md
CHANGED
@@ -1,10 +1,8 @@
|
|
1
1
|
# Csv::Import::Analyzer
|
2
2
|
|
3
|
-
|
4
|
-
Calculate min-max bounds for each column
|
5
|
-
Determine which coulmns are nullable in the csv file
|
3
|
+
CsvImportAnalyzer is intended to help perform data analysis on csv (comma separated), tsv (tab separated) or ssv (semicolon separated) files. It can be used to process large datasets in desired chunk sizes (defaults to 200 rows), and gives you a comprehensive analysis of each column: its possible datatype, its minimum and maximum bounds, and whether the column can be set to nullable.
|
6
4
|
|
7
|
-
Note
|
5
|
+
<b>Note</b>: This gem expects the first line to be a definitive header, i.e. the column names to be used if the csv file is to be imported into a database.
|
8
6
|
|
9
7
|
## Installation
|
10
8
|
|
@@ -24,25 +22,117 @@ Or install it yourself as:
|
|
24
22
|
|
25
23
|
## Usage
|
26
24
|
|
27
|
-
Calling process on a filename would generate
|
25
|
+
Calling process on a filename would generate metadata for the sample file and return it as a json object. This metadata would have the following:
|
26
|
+
<ul>
|
27
|
+
<li> High level stats for the given file (E.g. filename, file size, number of rows, number of columns).</li>
|
28
|
+
<li> Data manipulation done for pre-processing the file.</li>
|
29
|
+
<li> Data analysis on each column as key value pairs.</li>
|
30
|
+
<li> By default you would also get the MySQL queries needed to import the file into a database.</li>
|
31
|
+
</ul>
|
32
|
+
```ruby
|
33
|
+
CsvImportAnalyzer.process(filename)
|
34
|
+
```
|
35
|
+
|
36
|
+
## Demo
|
37
|
+
|
38
|
+
Below is a sample test.csv file
|
39
|
+
|
40
|
+
```
|
41
|
+
Year ID,Make ID,Model ID,Description ID,Price ID
|
42
|
+
1997,Ford,E350,"ac, abs, moon","3000.00"
|
43
|
+
1999,Chevy,"Venture ""Extended Edition""",,4900.00
|
44
|
+
1999,"Chevy","Venture ""Extended Edition, Very Large""","",5000.00
|
45
|
+
1996,Jeep,Grand Che'rokee,"MUST SELL!air, moon roof, loaded",4799.00
|
46
|
+
```
|
47
|
+
To get the data analysis of the above file, you can use CsvImportAnalyzer to process the file.
|
28
48
|
|
29
49
|
```ruby
|
50
|
+
metadata = CsvImportAnalyzer.process("test.csv", {:distinct => 2})
|
51
|
+
```
|
52
|
+
### Result
|
53
|
+
Now the metadata would hold the json object of the comprehensive analysis. Below is what the metadata would be for the sample csv file.
|
54
|
+
```ruby
|
55
|
+
puts metadata
|
56
|
+
```
|
57
|
+
```json
|
58
|
+
{
|
59
|
+
"csv_file": {
|
60
|
+
"filename": "sampleTab.csv",
|
61
|
+
"file_size": 276,
|
62
|
+
"record_delimiter": ",",
|
63
|
+
"rows": 6,
|
64
|
+
"columns": 5,
|
65
|
+
"processed_filename": "processed_sampleTab.csv",
|
66
|
+
"processed_file_path": "/tmp/processed_sampleTab.csv",
|
67
|
+
"processed_file_size": 279,
|
68
|
+
"error_report": "/tmp/error_report_sampleTab.csv"
|
69
|
+
},
|
70
|
+
"data_manipulations": {
|
71
|
+
"replace_nulls": true,
|
72
|
+
"replace_quotes": true
|
73
|
+
},
|
74
|
+
"csv_headers": {
|
75
|
+
"year_id": {
|
76
|
+
"datatype": "int",
|
77
|
+
"datatype_analysis": {
|
78
|
+
"int": 4
|
79
|
+
},
|
80
|
+
"distinct_values": "2+"
|
81
|
+
},
|
82
|
+
"make_id": {
|
83
|
+
"datatype": "string",
|
84
|
+
"datatype_analysis": {
|
85
|
+
"string": 4
|
86
|
+
},
|
87
|
+
"distinct_values": "2+"
|
88
|
+
},
|
89
|
+
"model_id": {
|
90
|
+
"datatype": "string",
|
91
|
+
"datatype_analysis": {
|
92
|
+
"string": 4
|
93
|
+
},
|
94
|
+
"distinct_values": "2+"
|
95
|
+
},
|
96
|
+
"description_id": {
|
97
|
+
"datatype": "string",
|
98
|
+
"datatype_analysis": {
|
99
|
+
"string": 2
|
100
|
+
},
|
101
|
+
"distinct_values": [
|
102
|
+
"ac, abs, moon",
|
103
|
+
"MUST SELL!air, moon roof, loaded"
|
104
|
+
],
|
105
|
+
"nullable": true
|
106
|
+
},
|
107
|
+
"price_id": {
|
108
|
+
"datatype": "float",
|
109
|
+
"datatype_analysis": {
|
110
|
+
"float": 4
|
111
|
+
},
|
112
|
+
"distinct_values": "2+"
|
113
|
+
}
|
114
|
+
},
|
115
|
+
"sql": {
|
116
|
+
"mysql": {
|
117
|
+
"create_query": "create table processed_sampletab.csv ( year_id int not null, make_id varchar(255) not null, model_id varchar(255) not null, description_id varchar(255), price_id float not null);",
|
118
|
+
"import_query": "COPY processed_sampletab.csv FROM '/tmp/processed_sampleTab.csv' HEADER DELIMITER ',' CSV NULL AS 'NULL';"
|
119
|
+
}
|
120
|
+
}
|
121
|
+
}
|
30
122
|
|
31
|
-
CsvImportAnalyzer.process(filename)
|
32
|
-
|
33
123
|
```
|
34
124
|
|
35
125
|
## TODO:
|
36
126
|
<ul>
|
37
|
-
<li> Handle control of processed input file to user </li>
|
38
|
-
<li> Return the analysis as Json object.</li>
|
39
127
|
<li> Better - Structuring the analysis outputted to csv</li>
|
40
128
|
<li> Add support to convert and import xlsx files to csv </li>
|
129
|
+
<li> Handle control of processed input file to user </li>
|
41
130
|
</ul>
|
42
131
|
|
43
132
|
## Additional Information
|
44
133
|
|
45
134
|
### Dependencies
|
135
|
+
|
46
136
|
<ul><li><a href="https://github.com/tilo/smarter_csv">smarter_csv</a> - For processing the csv in chunks</li></ul>
|
47
137
|
|
48
138
|
## Contributing
|
@@ -52,3 +142,4 @@ Calling process on a filename would generate a metadata_output.json which has th
|
|
52
142
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
53
143
|
4. Push to the branch (`git push origin my-new-feature`)
|
54
144
|
5. Create a new Pull Request
|
145
|
+
|
@@ -69,12 +69,12 @@ module CsvImportAnalyzer
|
|
69
69
|
{
|
70
70
|
:metadata_output => nil, # To be set if metadata needs to be printed to a file
|
71
71
|
:processed_input => nil, # To be set if processed input is needed
|
72
|
-
:unique =>
|
72
|
+
:unique => 2, # Threshold for the number of distinct values that need to be identified
|
73
73
|
:check_bounds => true, # Option to check for min - max bounds for each column [true => find the bounds]
|
74
74
|
:datatype_analysis => 200, # Number of rows to be sampled for datatype analysis
|
75
75
|
:chunk => 200, # Chunk size (no of rows) that needs to be processed in-memory [Important not to load entire file into memory]
|
76
|
-
:database => [:
|
77
|
-
:quote_convert => true, # Convert
|
76
|
+
:database => [:mysql], # Databases for which schema needs to be generated
|
77
|
+
:quote_convert => true, # Convert single quotes to double quotes
|
78
78
|
:replace_nulls => true, # Replace nulls, empty's, nils, Null's with NULL
|
79
79
|
:out_format => :json # Set the output format you need for the analysis
|
80
80
|
}
|
@@ -178,8 +178,8 @@ module CsvImportAnalyzer
|
|
178
178
|
columns[column_name] = {}
|
179
179
|
columns[column_name][:datatype] = header_datatypes[column_name]
|
180
180
|
columns[column_name][:datatype_analysis] = header_datatype_analysis[column_name]
|
181
|
-
if unique_values[column_name].size > max_distinct_values
|
182
|
-
columns[column_name][:distinct_values] = "#{max_distinct_values}+"
|
181
|
+
if unique_values[column_name].size > max_distinct_values - 1
|
182
|
+
columns[column_name][:distinct_values] = "#{max_distinct_values - 1}+"
|
183
183
|
else
|
184
184
|
columns[column_name][:distinct_values] = unique_values[column_name]
|
185
185
|
end
|