table_importer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.travis.yml +7 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE +21 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +9 -0
  8. data/Rakefile +2 -0
  9. data/lib/table_importer/copy_and_paste.rb +118 -0
  10. data/lib/table_importer/csv.rb +146 -0
  11. data/lib/table_importer/excel.rb +92 -0
  12. data/lib/table_importer/exceptions.rb +29 -0
  13. data/lib/table_importer/source.rb +149 -0
  14. data/lib/table_importer/version.rb +3 -0
  15. data/lib/table_importer.rb +3 -0
  16. data/spec/copy_and_paste_spec.rb +134 -0
  17. data/spec/csv_spec.rb +135 -0
  18. data/spec/excel_spec.rb +139 -0
  19. data/spec/files/csv/10-1359651839-google_(1).csv +0 -0
  20. data/spec/files/csv/11-1359651879-contacts_(1) (1).csv +158 -0
  21. data/spec/files/csv/11-1359651879-contacts_(1).csv +158 -0
  22. data/spec/files/csv/6-1359649307-contacts (1).csv +157 -0
  23. data/spec/files/csv/6-1359649307-contacts (2).csv +158 -0
  24. data/spec/files/csv/6-1359649307-contacts (3).csv +158 -0
  25. data/spec/files/csv/6-1359649307-contacts.csv +158 -0
  26. data/spec/files/csv/7-1359650836-6-1359649307-contacts.csv +158 -0
  27. data/spec/files/csv/8-1359651745-contacts.csv +158 -0
  28. data/spec/files/csv/9-1359651826-google_(1).csv +0 -0
  29. data/spec/files/csv/bad_headers_2.csv +45 -0
  30. data/spec/files/csv/csv_headers.csv +55 -0
  31. data/spec/files/csv/csv_no_headers.csv +5 -0
  32. data/spec/files/csv/edge_cases.csv +16 -0
  33. data/spec/files/csv/hong_kong.csv +1150 -0
  34. data/spec/files/csv/hong_kong_no_headers.csv +9 -0
  35. data/spec/files/csv/hong_kong_small.csv +10 -0
  36. data/spec/files/csv/mexico2013_pressdoc.csv +3248 -0
  37. data/spec/files/csv/no_content.csv +22 -0
  38. data/spec/files/csv/semicolon.csv +214 -0
  39. data/spec/files/csv/with_headers.csv +10 -0
  40. data/spec/files/csv/with_headers_large.csv +10760 -0
  41. data/spec/files/csv/without_headers.csv +9 -0
  42. data/spec/files/excel/edge_cases.xls +0 -0
  43. data/spec/files/excel/no_content.xls +0 -0
  44. data/spec/files/excel/no_content.xlsx +0 -0
  45. data/spec/files/excel/with_headers.xls +0 -0
  46. data/spec/files/excel/with_headers_large.xls +0 -0
  47. data/spec/files/excel/with_headers_large.xlsx +0 -0
  48. data/spec/files/excel/without_headers.xls +0 -0
  49. data/spec/spec_helper.rb +20 -0
  50. data/table_importer.gemspec +32 -0
  51. data/tasks/rspec.rake +4 -0
  52. metadata +254 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a17aaa63e9aa58fb38435c0b2a205bbbfa63af1e
4
+ data.tar.gz: bd75fd7ac865182feb9d0ff6ab7c0d4e18af35a2
5
+ SHA512:
6
+ metadata.gz: 19aca54d1033e6bb488482f5a8506b902ef6f73bfd80f1539102710b87ef506fb319ab9252daa271e37635ccf26127c0a4a3c1336ec8f5b211f32357c7a5e6fc
7
+ data.tar.gz: 27b5e4f763ee14972c3fddb2f69ea84d111a097069ab5605bf50df40f55120480bef0ecbcc42147880ffd073470e4d9fbec06d9d4129b87dbb76b9e06efbe918
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+
data/.travis.yml ADDED
@@ -0,0 +1,7 @@
1
+ language: ruby
2
+ rvm:
3
+ - "1.9.3"
4
+ - "2.0.0"
5
+ - "2.1.1"
6
+ - "2.1.2"
7
+ - jruby-19mode # JRuby in 1.9 mode
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in table_importer.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2014 pr.co
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 TODO: Write your name
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,9 @@
1
+ [![Build Status](https://travis-ci.org/pressdoc/table_importer.svg?branch=master)](https://travis-ci.org/pressdoc/table_importer)[![Coverage Status](https://coveralls.io/repos/pressdoc/table_importer/badge.png?branch=master)](https://coveralls.io/r/pressdoc/table_importer?branch=master)
2
+
3
+ ==============
4
+ Table Importer
5
+ ==============
6
+
7
+ Given a file (or a string) containing tabular data, along with options, it will return a hash of those values. Great for importing poorly formatted CSV files.
8
+
9
+ Only works for ruby versions >= 1.9.3.
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+ Dir.glob('tasks/**/*.rake').each(&method(:import))
@@ -0,0 +1,118 @@
module TableImporter

  # Importer for tabular text that was pasted in as a single string.
  #
  # Several helpers used here (SEPARATORS, default_headers, clean_chunks,
  # get_sep_count, sort_separators) are not defined in this file; they are
  # presumably provided by Source -- confirm against source.rb.
  class CopyAndPaste < Source

    # @param data [Hash] import options. The keys read here are :content,
    #   :col_sep, :rec_sep, :headers, :headers_present and :compulsory_headers.
    # @raise [Exceptions::EmptyFileImportError] when :content is blank (see #assign_data)
    def initialize(data)
      @data = assign_data(data[:content])
      @column_separator, @record_separator = assign_separators(data[:col_sep], data[:rec_sep])
      @headers, @headers_present = assign_headers(data[:headers], data[:headers_present])
      @compulsory_headers = data[:compulsory_headers]
      # Empty-column removal is only switched on for inputs shorter than 50,000 characters.
      @delete_empty_columns = @data.length < 50000
    end

    # Validates the pasted content and normalises its line endings to "\n".
    #
    # @param content [String] the raw pasted text
    # @return [String] a copy of content with "\r\n" and "\r" replaced by "\n"
    # @raise [Exceptions::EmptyFileImportError] when content is blank or its first
    #   101 characters contain no ASCII letter or digit
    def assign_data(content)
      raise Exceptions::EmptyFileImportError.new if content.blank? || content[0..100].gsub(/[^A-Za-z0-9]/, '').blank?
      # Non-mutating gsub: the caller's string is left untouched and frozen
      # strings are accepted (gsub! would modify the argument in place).
      content.gsub(/\r\n|\r/, "\n")
    end

    # Translates separator names into separator strings via SEPARATORS and then
    # lets #data_conforms_pattern fill in any that are still missing.
    #
    # @param col_sep [#to_sym, nil] key into SEPARATORS for the column separator
    # @param rec_sep [#to_sym, nil] key into SEPARATORS for the record separator
    # @return [Array] two-element array: column separator, record separator
    def assign_separators(col_sep, rec_sep)
      col_sep = SEPARATORS[col_sep.to_sym] unless col_sep.nil?
      rec_sep = SEPARATORS[rec_sep.to_sym] unless rec_sep.nil?
      data_conforms_pattern(col_sep, rec_sep)
    end

    # Detects input that looks like a list of "Name <address@host>, ..." entries
    # and, if so, supplies the matching separators unless they were already given.
    #
    # @return [Array] two-element array: column separator, record separator
    def data_conforms_pattern(col_sep, rec_sep)
      # Check to see if data is of bcc style
      first_item = @data.split(",").first
      if first_item.present? && first_item.match(/\S@\S/) && first_item.match(/<(\S+@\S+)/)
        rec_sep ||= ">, "
        col_sep ||= " <"
      end
      return col_sep, rec_sep
    end

    # Chooses the headers to use when none were supplied: the first line of the
    # data if headers are present, otherwise whatever #get_headers returns.
    #
    # @return [Array] two-element array: headers, headers_present
    def assign_headers(headers, headers_present)
      if headers.blank?
        headers = headers_present ? get_first_line : get_headers
      end
      return headers, headers_present
    end

    # @return [Array<Symbol>] the cells of the first record, as symbols
    def get_first_line
      @data.split(get_record_separator).first.split(get_column_separator).map(&:to_sym)
    end

    # @return [String] the identifier of this importer type
    def get_type
      "copy_and_paste"
    end

    # @return the stored headers when present, otherwise default_headers(100)
    def get_headers
      return @headers if @headers.present?
      default_headers(100)
    end

    # Returns up to eight cleaned lines for previewing. When the cleaned window
    # starts with nil, the window is moved forward by ten and tried again.
    #
    # @raise [Exceptions::EmptyStringImportError] when any StandardError occurs
    #   while building the preview
    def get_preview_lines(start_point = @headers_present ? 1 : 0, end_point = 10)
      lines = clean_chunks([get_lines(start_point, end_point)], @compulsory_headers, @delete_empty_columns)[0][:lines]
      if lines.first.nil?
        get_preview_lines(start_point + 10, end_point + 10)
      else
        lines[0..7]
      end
    rescue StandardError
      raise Exceptions::EmptyStringImportError.new
    end

    # Splits the data into records and maps every record onto @headers.
    #
    # @param start_point [Integer] index of the first record to return
    # @param number_of_lines [Integer] how many records to return; -1 means
    #   every record from start_point onwards
    # @return [Array<Hash>, nil] the selected records; nil when start_point lies
    #   beyond the end of the data (Array slicing semantics)
    def get_lines(start_point, number_of_lines)
      get_column_separator # called for its side effect: makes sure @column_separator is set
      mapped_lines = @data.split(get_record_separator).map do |line|
        Hash[@headers.zip(remove_whitespace(line.split(@column_separator)))]
      end
      # -1 means return all lines. The previous range arithmetic only honoured
      # that when start_point was 0 (start..start-1 is empty for any other start).
      return mapped_lines[start_point..-1] if number_of_lines == -1
      mapped_lines[start_point, number_of_lines]
    end

    # Strips leading and trailing whitespace from every cell, in place.
    #
    # @param column [Array<String>] the cells of one record
    # @return [Array<String>] the same array
    def remove_whitespace(column)
      column.each(&:strip!)
    end

    # Re-maps the headers, then returns every record grouped into chunks of
    # chunk_size and passed through clean_chunks.
    def get_chunks(chunk_size)
      @headers = convert_headers(get_first_line, @headers, @headers_present)
      lines = get_lines(0, -1).in_groups_of(chunk_size, false)
      clean_chunks(lines, @compulsory_headers)
    end

    # Builds a default header list with as many entries as there are columns and
    # overwrites the positions named in mapped_headers.
    #
    # @param provided_headers [Array] headers read from the data
    # @param mapped_headers [#each] pairs of header name and column index; only
    #   pairs whose value is the string form of an integer are applied
    # @param headers_present [Boolean] whether the data carries a header line
    def convert_headers(provided_headers, mapped_headers, headers_present)
      column_source = headers_present ? provided_headers : default_headers
      new_headers = default_headers(column_source.count)
      mapped_headers.each do |key, value|
        new_headers[value.to_i] = key.to_sym if value.to_i.to_s == value
      end
      new_headers
    end

    # Returns the column separator, detecting and memoising it when unset.
    # The record separator, if known, is excluded from the candidates.
    def get_column_separator(first_line = @data)
      return @column_separator if !@column_separator.nil? && @column_separator.length > 0
      separators = get_sep_count(first_line)
      separators.reject! { |sep| sep.keys[0] == @record_separator } unless @record_separator.nil?
      @column_separator = sort_separators(separators)
    end

    # Returns the record separator, detecting and memoising it when unset.
    # The column separator is excluded from the candidates.
    def get_record_separator(first_line = @data)
      return @record_separator if !@record_separator.nil? && @record_separator.length > 0
      separators = get_sep_count(first_line)
      separators.reject! { |sep| sep.keys[0] == get_column_separator }
      @record_separator = sort_separators(separators)
    end
  end
end
@@ -0,0 +1,146 @@
# encoding: UTF-8

module TableImporter

  # Importer for CSV files, built on SmarterCSV.
  #
  # Several helpers used here (SEPARATORS, default_headers, clean_chunks,
  # get_sep_count, sort_separators) are not defined in this file; they are
  # presumably provided by Source -- confirm against source.rb.
  class CSV < Source

    # @param data [Hash] import options. The keys read here are :headers_present,
    #   :headers, :column_separator, :record_separator, :compulsory_headers and
    #   :content (a file object; both File.size and #path are called on it).
    # @raise [Exceptions::EmptyFileImportError] when the file yields no content
    def initialize(data)
      @headers_present = data[:headers_present] # user has indicated headers are provided
      @headers = data[:headers]
      @column_separator = SEPARATORS[data[:column_separator].to_sym] if !data[:column_separator].nil?
      @record_separator = !data[:record_separator].nil? && data[:record_separator].length > 0 ? SEPARATORS[data[:record_separator].to_sym] : "\n"
      @compulsory_headers = data[:compulsory_headers]
      @file = data[:content]
      # Empty-column removal is only switched on for files smaller than 100,000 bytes.
      @delete_empty_columns = File.size(@file) < 100000
      begin
        first_line = get_first_line
        # get_first_line only returns from inside the SmarterCSV block; a 0 here is
        # presumably SmarterCSV's own return value when no chunk was yielded -- TODO confirm.
        if first_line == 0
          raise ArgumentError
        end
        get_column_separator(first_line)
        @preview_lines = file_has_no_content
        @headers = @headers_present ? first_line.split(@column_separator) : default_headers(100) if @headers.blank?
      rescue ArgumentError
        # NOTE(review): this retry has no explicit bound; it relies on the re-encoded
        # copy produced by clean_file parsing cleanly the second time round.
        @file = clean_file(@file)
        retry
      end
    end

    # Reads the first chunk of the file and returns its first key (when headers
    # are present) or first value (when they are not) as a string.
    #
    # @raise [Exceptions::EmptyFileImportError] when SmarterCSV raises EOFError
    def get_first_line
      begin
        SmarterCSV.process(@file.path, default_options({:col_sep => @column_separator.present? ? @column_separator : "\n", :row_sep => @record_separator != nil ? @record_separator : "\n", :chunk_size => 8})) do |chunk|
          if @headers_present
            return chunk.first.keys[0].to_s
          else
            return chunk.first.values[0].to_s
          end
        end
      rescue EOFError
        raise Exceptions::EmptyFileImportError.new
      end
    end

    # Despite the name, returns the preview lines when there are any.
    #
    # @return the value of #get_preview_lines
    # @raise [Exceptions::EmptyFileImportError] when the preview is blank or 0, or
    #   when building it raises NoMethodError
    def file_has_no_content
      begin
        lines = get_preview_lines
        if lines.blank? || lines == 0
          raise Exceptions::EmptyFileImportError.new
        else
          return lines
        end
      rescue NoMethodError
        raise Exceptions::EmptyFileImportError.new
      end
    end

    # @return [String] the identifier of this importer type
    def get_type
      "csv"
    end

    # @return the headers stored by the constructor
    def get_headers
      @headers
    end

    # Returns the column separator, detecting and memoising it when unset.
    # The record separator, if known, is excluded from the candidates.
    def get_column_separator(first_line = get_first_line)
      return @column_separator if !@column_separator.nil? && @column_separator.length > 0
      separators = get_sep_count(first_line)
      separators.reject!{ |sep| sep.keys[0] == @record_separator} if @record_separator != nil
      @column_separator = sort_separators(separators)
    end

    # Returns the record separator, detecting and memoising it when unset.
    # The column separator is excluded from the candidates.
    def get_record_separator(first_line = get_first_line)
      return @record_separator if !@record_separator.nil? && @record_separator.length > 0
      separators = get_sep_count(first_line)
      separators.reject!{ |sep| sep.keys[0] == get_column_separator}
      @record_separator = sort_separators(separators)
    end

    # Returns cleaned lines for previewing. Cached preview lines are re-cleaned and
    # returned if present; otherwise the file is read in a chunk of 50 rows (when
    # empty columns are being deleted) or 8 rows, and up to eight lines are returned.
    #
    # @raise [Exceptions::HeaderMismatchError] when SmarterCSV reports a header size mismatch
    def get_preview_lines
      begin
        return clean_chunks([@preview_lines], @compulsory_headers, @delete_empty_columns)[0].symbolize_keys[:lines] if !@preview_lines.blank?
        if @delete_empty_columns
          chunks = SmarterCSV.process(@file.path, default_options({:row_sep => @record_separator != nil ? @record_separator : "\n", :chunk_size => 50}))
          return clean_chunks(chunks, @compulsory_headers, true)[0].symbolize_keys[:lines][0..7]
        end
        SmarterCSV.process(@file.path, default_options({:row_sep => @record_separator != nil ? @record_separator : "\n", :chunk_size => 8})) do |chunk|
          return clean_chunks([chunk], @compulsory_headers)[0].symbolize_keys[:lines][0..7]
        end
      rescue SmarterCSV::HeaderSizeMismatch
        raise Exceptions::HeaderMismatchError.new
      end
    end

    # this is horrendously slow
    # Slices the result of get_chunks(50) with the inclusive range
    # start..(start + number_of_lines), so the whole file is processed on every call.
    def get_lines(start, number_of_lines)
      get_chunks(50)[start..(start + number_of_lines)]
    end

    # Processes the whole file in chunks of chunk_size and passes them through
    # clean_chunks. With headers present the file's own keys are re-mapped and
    # unmapped keys are dropped; without them the mapped names are supplied to
    # SmarterCSV as user-provided headers.
    def get_chunks(chunk_size)
      begin
        chunks = []
        if @headers_present
          key_mapping = convert_headers(SmarterCSV.process(@file.path, default_options).first.keys, @headers, @headers_present).delete_if{ |key, value| value.blank?}
          chunks = SmarterCSV.process(@file.path, default_options({:chunk_size => chunk_size, :key_mapping => key_mapping, :remove_unmapped_keys => true, :user_provided_headers => nil}))
        else
          user_provided_headers = convert_headers(SmarterCSV.process(@file.path, default_options).first.keys, @headers, @headers_present).values
          chunks = SmarterCSV.process(@file.path, default_options({:chunk_size => chunk_size, :user_provided_headers => user_provided_headers, :remove_empty_values => true}))
        end
        clean_chunks(chunks, @compulsory_headers, @delete_empty_columns)
      rescue ArgumentError
        # NOTE(review): unbounded retry, same pattern as in the constructor.
        @file = clean_file(@file)
        retry
      end
    end

    # Maps every existing header to a new name: the key from mapped_headers whose
    # value equals the column index, or :column_<index> when there is none.
    #
    # @param provided_headers [Array] headers read from the file
    # @param mapped_headers [#each] pairs of new header name and column index
    # @param headers_present [Boolean] whether the file carries a header line
    # @return [Hash] old header => new header
    def convert_headers(provided_headers, mapped_headers, headers_present)
      new_headers = []
      old_headers = headers_present ? provided_headers : default_headers
      old_headers.each_with_index do |key, index|
        key_to_add = "column_#{index}".to_sym
        mapped_headers.each do |new_key, value|
          if value.to_s == index.to_s
            key_to_add = new_key
          end
        end
        new_headers << key_to_add
      end
      Hash[old_headers.zip(new_headers)]
    end

    # fix quote_char
    # bit of a hack here to provide the correct number of default headers to the user (rather than just 100)
    # Base SmarterCSV options for this file; entries in options override the defaults.
    def default_options(options = {})
      {:col_sep => @column_separator, :row_sep => @record_separator, :quote_char => "‱", :remove_empty_values => false,
        :verbose => false, :headers_in_file => @headers_present, :convert_values_to_numeric => false,
        :user_provided_headers => @headers_present ? (@headers == nil || @headers == {} ? nil : @headers) : default_headers(100)}.merge(options)
    end

    # Copies file into a new Tempfile, replacing invalid byte sequences with "?"
    # and normalising "\r\n" and "\r" line endings to "\n".
    #
    # @param file [#read] the file to clean
    # @return [Tempfile] the cleaned copy, already closed
    def clean_file(file)
      contents = file.read
      import = Tempfile.new(["import", ".xls"], :encoding => "UTF-8")
      # The round trip through UTF-16 is what replaces the invalid sequences.
      sanitized = contents.force_encoding('UTF-8').encode('UTF-16', :invalid => :replace, :replace => '?').encode('UTF-8')
      # Non-bang gsub: gsub! returns nil when nothing is substituted, which made
      # the cleaned copy come out empty for content without any "\r".
      import.write(sanitized.gsub(/\r\n|\r/, "\n"))
      import.close
      return import
    end
  end
end
@@ -0,0 +1,92 @@
module TableImporter

  # Importer for .xls and .xlsx spreadsheets, built on Roo. Only the first
  # sheet is read.
  #
  # default_headers and clean_chunks are not defined in this file; they are
  # presumably provided by Source -- confirm against source.rb.
  class Excel < Source

    # @param data [Hash] import options. The keys read here are :content (a file
    #   object responding to #path), :headers_present, :compulsory_headers,
    #   :user_headers and :headers.
    # @raise [Exceptions::EmptyFileImportError] when the sheet has no first row
    # @raise [Exceptions::HeaderMismatchError] when a NoMethodError occurs during set-up
    # @raise [Exceptions::IncorrectFileError] when the file cannot be opened (see #get_file)
    def initialize(data)
      # Anything that is not ".xls" is treated as xlsx.
      @type = File.extname(data[:content]) == ".xls" ? "xls" : "xlsx"
      @file_path = data[:content].path
      @headers_present = data[:headers_present]
      @file = get_file
      @compulsory_headers = data[:compulsory_headers]
      # Empty-column removal is only switched on for files smaller than 100,000 bytes.
      @delete_empty_columns = (File.size(@file_path) < 100000)
      @mapping = !data[:user_headers].blank? ? data[:user_headers] : data[:headers]
      raise Exceptions::EmptyFileImportError.new if !@file.first_row
      if !data[:headers].nil?
        @headers = data[:headers]
      else
        @headers = @headers_present ? @file.row(1).map { |header| header.to_sym } : default_headers
      end
    rescue NoMethodError
      raise Exceptions::HeaderMismatchError.new
    end

    # @return the headers currently in use
    def get_headers
      @headers
    end

    # Opens the first sheet of the workbook with the Roo class matching @type.
    #
    # @raise [Exceptions::IncorrectFileError] when Roo raises TypeError
    def get_file
      if @type == "xls"
        Roo::Excel.new(@file_path).sheet(0)
      elsif @type == "xlsx"
        Roo::Excelx.new(@file_path).sheet(0)
      end
    rescue TypeError
      raise Exceptions::IncorrectFileError.new
    end

    # @return [String] always "xls", for both workbook formats
    def get_type
      "xls"
    end

    # Returns up to eight cleaned lines for previewing. The probe window is moved
    # forward by ten rows while its cleaned content starts with nil; once content
    # is found the headers are re-mapped (if a mapping exists) and the window
    # shifted down by one row is returned.
    #
    # @raise [Exceptions::EmptyFileImportError] when no window contains content
    def get_preview_lines(start_point = 0, end_point = 10)
      # Iterative search bounded by the last row. The earlier version recursed
      # until the stack overflowed in order to detect an empty sheet.
      loop do
        probe = clean_chunks([get_lines(start_point, end_point)], @compulsory_headers)[0][:lines]
        break unless probe.first.nil?
        # get_lines returns nothing for any start at or beyond the last row, so
        # moving further forward cannot find content. @last_row was set by get_lines.
        raise Exceptions::EmptyFileImportError.new if start_point >= @last_row
        start_point += 10
        end_point += 10
      end
      @headers = @mapping.present? ? convert_headers : @headers
      clean_chunks([get_lines(start_point + 1, end_point + 1)], @compulsory_headers, @delete_empty_columns)[0][:lines][0..7]
    rescue SystemStackError
      # Kept for compatibility with the previous contract.
      raise Exceptions::EmptyFileImportError.new
    end

    # Maps sheet rows onto @headers.
    #
    # @param start [Integer] first row number to read
    # @param number_of_lines [Integer] maximum number of rows to read
    # @return [Array<Hash>] rows start up to, but not including,
    #   min(last row, start + number_of_lines); the last row itself is therefore
    #   never returned by this method
    def get_lines(start, number_of_lines)
      @last_row ||= @file.last_row
      finish = [@last_row, start + number_of_lines].min
      (start...finish).map do |row_number|
        Hash[@headers.zip(@file.row(row_number))]
      end
    end

    # Builds a default header list with as many entries as there are columns and
    # overwrites the positions named in @mapping. Only pairs whose value is the
    # string form of an integer are applied.
    def convert_headers
      column_source = @headers_present ? @file.row(1) : default_headers
      new_headers = default_headers(column_source.count)
      # @mapping is nil when neither :user_headers nor :headers was supplied;
      # get_chunks calls this method unconditionally, so tolerate that here.
      (@mapping || {}).each do |key, value|
        new_headers[value.to_i] = key.to_sym if value.to_i.to_s == value
      end
      new_headers
    end

    # Re-maps the headers, reads the sheet in chunks of chunk_size rows and passes
    # the chunks through clean_chunks. Reading starts at row 2 when headers are
    # present, otherwise at row 1.
    def get_chunks(chunk_size)
      @headers = convert_headers
      @last_row ||= @file.last_row
      chunks = []
      start_point = @headers_present ? 2 : 1
      while chunks.count <= @last_row/chunk_size
        chunks << get_lines(start_point, chunk_size)
        start_point += chunk_size
      end
      # get_lines stops short of the last row, so it is appended explicitly.
      chunks.last << Hash[@headers.zip(@file.row(@last_row))]
      clean_chunks(chunks, @compulsory_headers)
    end
  end
end
+ end