table_importer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.travis.yml +7 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE +21 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +9 -0
  8. data/Rakefile +2 -0
  9. data/lib/table_importer/copy_and_paste.rb +118 -0
  10. data/lib/table_importer/csv.rb +146 -0
  11. data/lib/table_importer/excel.rb +92 -0
  12. data/lib/table_importer/exceptions.rb +29 -0
  13. data/lib/table_importer/source.rb +149 -0
  14. data/lib/table_importer/version.rb +3 -0
  15. data/lib/table_importer.rb +3 -0
  16. data/spec/copy_and_paste_spec.rb +134 -0
  17. data/spec/csv_spec.rb +135 -0
  18. data/spec/excel_spec.rb +139 -0
  19. data/spec/files/csv/10-1359651839-google_(1).csv +0 -0
  20. data/spec/files/csv/11-1359651879-contacts_(1) (1).csv +158 -0
  21. data/spec/files/csv/11-1359651879-contacts_(1).csv +158 -0
  22. data/spec/files/csv/6-1359649307-contacts (1).csv +157 -0
  23. data/spec/files/csv/6-1359649307-contacts (2).csv +158 -0
  24. data/spec/files/csv/6-1359649307-contacts (3).csv +158 -0
  25. data/spec/files/csv/6-1359649307-contacts.csv +158 -0
  26. data/spec/files/csv/7-1359650836-6-1359649307-contacts.csv +158 -0
  27. data/spec/files/csv/8-1359651745-contacts.csv +158 -0
  28. data/spec/files/csv/9-1359651826-google_(1).csv +0 -0
  29. data/spec/files/csv/bad_headers_2.csv +45 -0
  30. data/spec/files/csv/csv_headers.csv +55 -0
  31. data/spec/files/csv/csv_no_headers.csv +5 -0
  32. data/spec/files/csv/edge_cases.csv +16 -0
  33. data/spec/files/csv/hong_kong.csv +1150 -0
  34. data/spec/files/csv/hong_kong_no_headers.csv +9 -0
  35. data/spec/files/csv/hong_kong_small.csv +10 -0
  36. data/spec/files/csv/mexico2013_pressdoc.csv +3248 -0
  37. data/spec/files/csv/no_content.csv +22 -0
  38. data/spec/files/csv/semicolon.csv +214 -0
  39. data/spec/files/csv/with_headers.csv +10 -0
  40. data/spec/files/csv/with_headers_large.csv +10760 -0
  41. data/spec/files/csv/without_headers.csv +9 -0
  42. data/spec/files/excel/edge_cases.xls +0 -0
  43. data/spec/files/excel/no_content.xls +0 -0
  44. data/spec/files/excel/no_content.xlsx +0 -0
  45. data/spec/files/excel/with_headers.xls +0 -0
  46. data/spec/files/excel/with_headers_large.xls +0 -0
  47. data/spec/files/excel/with_headers_large.xlsx +0 -0
  48. data/spec/files/excel/without_headers.xls +0 -0
  49. data/spec/spec_helper.rb +20 -0
  50. data/table_importer.gemspec +32 -0
  51. data/tasks/rspec.rake +4 -0
  52. metadata +254 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a17aaa63e9aa58fb38435c0b2a205bbbfa63af1e
4
+ data.tar.gz: bd75fd7ac865182feb9d0ff6ab7c0d4e18af35a2
5
+ SHA512:
6
+ metadata.gz: 19aca54d1033e6bb488482f5a8506b902ef6f73bfd80f1539102710b87ef506fb319ab9252daa271e37635ccf26127c0a4a3c1336ec8f5b211f32357c7a5e6fc
7
+ data.tar.gz: 27b5e4f763ee14972c3fddb2f69ea84d111a097069ab5605bf50df40f55120480bef0ecbcc42147880ffd073470e4d9fbec06d9d4129b87dbb76b9e06efbe918
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+
data/.travis.yml ADDED
@@ -0,0 +1,7 @@
1
+ language: ruby
2
+ rvm:
3
+ - "1.9.3"
4
+ - "2.0.0"
5
+ - "2.1.1"
6
+ - "2.1.2"
7
+ - jruby-19mode # JRuby in 1.9 mode
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in table_importer.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2014 pr.co
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 pr.co
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,9 @@
1
+ [![Build Status](https://travis-ci.org/pressdoc/table_importer.svg?branch=master)](https://travis-ci.org/pressdoc/table_importer)[![Coverage Status](https://coveralls.io/repos/pressdoc/table_importer/badge.png?branch=master)](https://coveralls.io/r/pressdoc/table_importer?branch=master)
2
+
3
+ ==============
4
+ Table Importer
5
+ ==============
6
+
7
+ Given a file (or a string) containing tabular data, along with options, it will return the rows of that table as hashes. Great for importing poorly formatted CSV files.
8
+
9
+ Only works for ruby versions >= 1.9.3.
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+ Dir.glob('tasks/**/*.rake').each(&method(:import))
@@ -0,0 +1,118 @@
1
module TableImporter

  # Imports tabular data that is supplied as a raw string (e.g. pasted text).
  #
  # Depends on names that are not defined in this file: SEPARATORS,
  # default_headers, clean_chunks, get_sep_count and sort_separators
  # (presumably provided by Source -- confirm against source.rb), plus
  # blank?/present?/in_groups_of, which look like ActiveSupport extensions.
  class CopyAndPaste < Source

    # @param data [Hash] import options. Keys read here: :content, :col_sep,
    #   :rec_sep, :headers, :headers_present and :compulsory_headers.
    # @raise [Exceptions::EmptyFileImportError] when :content is blank or its
    #   first 101 characters contain no ASCII letter or digit.
    def initialize(data)
      @data = assign_data(data[:content])
      @column_separator, @record_separator = assign_separators(data[:col_sep], data[:rec_sep])
      @headers, @headers_present = assign_headers(data[:headers], data[:headers_present])
      @compulsory_headers = data[:compulsory_headers]
      # Empty-column removal is only requested for inputs under 50,000 characters.
      @delete_empty_columns = @data.length < 50_000
    end

    # Validates the pasted content and normalises its line endings to "\n".
    #
    # @param content [String, nil] the raw pasted text.
    # @return [String] a new string; the argument itself is not modified.
    # @raise [Exceptions::EmptyFileImportError] see #initialize.
    def assign_data(content)
      if content.blank? || content[0..100].gsub(/[^A-Za-z0-9]/, '').blank?
        raise Exceptions::EmptyFileImportError.new
      end
      # Non-destructive gsub: leaves the caller's string untouched and also
      # works when that string is frozen.
      content.gsub(/\r\n|\r/, "\n")
    end

    # Translates separator names into separator strings via SEPARATORS and
    # then lets #data_conforms_pattern fill in any that are still missing.
    #
    # @return [Array] [column_separator, record_separator]; either may be nil.
    def assign_separators(col_sep, rec_sep)
      col_sep = SEPARATORS[col_sep.to_sym] unless col_sep.nil?
      rec_sep = SEPARATORS[rec_sep.to_sym] unless rec_sep.nil?
      data_conforms_pattern(col_sep, rec_sep)
    end

    # Supplies default separators when the text before the first comma looks
    # like an address-list entry of the form "Name <user@host>".
    #
    # @return [Array] [col_sep, rec_sep], unchanged unless the pattern matched
    #   and the respective separator was nil.
    def data_conforms_pattern(col_sep, rec_sep)
      # Check to see if data is of bcc style
      first_item = @data.split(",").first
      if first_item.present? && first_item.match(/\S@\S/) && first_item.match(/<(\S+@\S+)/)
        rec_sep ||= ">, "
        col_sep ||= " <"
      end
      [col_sep, rec_sep]
    end

    # Picks the headers to use when the caller supplied none: the first
    # record when headers are present in the data, otherwise #get_headers.
    #
    # @return [Array] [headers, headers_present]
    def assign_headers(headers, headers_present)
      if headers.blank?
        headers = headers_present ? get_first_line : get_headers
      end
      [headers, headers_present]
    end

    # @return [Array<Symbol>] the cells of the first record, as symbols.
    def get_first_line
      first_record = @data.split(get_record_separator).first
      first_record.split(get_column_separator).map(&:to_sym)
    end

    # @return [String] the identifier of this importer type.
    def get_type
      "copy_and_paste"
    end

    # @return the stored headers when present, otherwise default_headers(100).
    def get_headers
      return @headers if @headers.present?
      default_headers(100)
    end

    # Returns up to eight cleaned lines for previewing. When the cleaned
    # window holds no lines, the window is moved forward by ten and retried.
    #
    # @param start_point [Integer] index of the first record to consider.
    # @param end_point [Integer] passed to #get_lines as the line count.
    # @raise [Exceptions::EmptyStringImportError] on any StandardError raised
    #   while building the preview.
    def get_preview_lines(start_point = @headers_present ? 1 : 0, end_point = 10)
      lines = clean_chunks([get_lines(start_point, end_point)], @compulsory_headers, @delete_empty_columns)[0][:lines]
      return get_preview_lines(start_point + 10, end_point + 10) if lines.first.nil?
      lines[0..7]
    rescue StandardError
      raise Exceptions::EmptyStringImportError.new
    end

    # Splits the data into records and maps a window of them onto @headers.
    #
    # @param start_point [Integer] index of the first record to return.
    # @param number_of_lines [Integer] how many records to return; -1 means
    #   every record from start_point to the end.
    # @return [Array<Hash>, nil] one hash per record (header => stripped
    #   cell); nil when start_point lies beyond the end of the data, which is
    #   what Array#[] yields for such a range.
    def get_lines(start_point, number_of_lines)
      get_column_separator # resolves @column_separator before it is used below
      records = @data.split(get_record_separator)
      selected =
        if number_of_lines == -1
          # An explicit open-ended slice keeps the "all lines" sentinel working
          # for any start_point, not only 0.
          records[start_point..-1]
        elsif number_of_lines == 0
          # Without this guard start_point 0 would yield the range 0..-1,
          # i.e. every record.
          []
        else
          records[start_point..(start_point + number_of_lines - 1)]
        end
      return nil if selected.nil?
      # Only the requested window is split and mapped.
      selected.map do |line|
        Hash[@headers.zip(remove_whitespace(line.split(@column_separator)))]
      end
    end

    # Strips leading and trailing whitespace from every cell, in place.
    #
    # @param column [Array<String>]
    # @return [Array<String>] the same array.
    def remove_whitespace(column)
      column.each(&:strip!)
    end

    # Re-derives @headers from the current mapping, then returns all records
    # in groups of chunk_size after passing them through clean_chunks.
    #
    # @param chunk_size [Integer]
    def get_chunks(chunk_size)
      @headers = convert_headers(get_first_line, @headers, @headers_present)
      lines = get_lines(0, -1).in_groups_of(chunk_size, false)
      clean_chunks(lines, @compulsory_headers)
    end

    # Builds a header list of default headers in which the positions named by
    # mapped_headers are replaced with the mapped keys.
    #
    # @param provided_headers [Array] headers read from the data; only their
    #   count is used.
    # @param mapped_headers [#each] pairs of key => column index. Only values
    #   that are strings holding a plain integer (e.g. "3") are applied.
    # @param headers_present [Boolean]
    def convert_headers(provided_headers, mapped_headers, headers_present)
      template = headers_present ? provided_headers : default_headers
      new_headers = default_headers(template.count)
      mapped_headers.each do |key, value|
        new_headers[value.to_i] = key.to_sym if value.to_i.to_s == value
      end
      new_headers
    end

    # Returns the column separator, detecting and caching it on first use.
    # The record separator, when known, is excluded from the candidates.
    def get_column_separator(first_line = @data)
      return @column_separator if !@column_separator.nil? && @column_separator.length > 0
      separators = get_sep_count(first_line)
      separators.reject! { |sep| sep.keys[0] == @record_separator } unless @record_separator.nil?
      @column_separator = sort_separators(separators)
    end

    # Returns the record separator, detecting and caching it on first use.
    # The column separator is excluded from the candidates.
    def get_record_separator(first_line = @data)
      return @record_separator if !@record_separator.nil? && @record_separator.length > 0
      separators = get_sep_count(first_line)
      separators.reject! { |sep| sep.keys[0] == get_column_separator }
      @record_separator = sort_separators(separators)
    end
  end
end
@@ -0,0 +1,146 @@
1
+ # encoding: UTF-8
2
+
3
module TableImporter

  # Imports tabular data from a CSV file by delegating parsing to SmarterCSV.
  #
  # Depends on names that are not defined in this file: SEPARATORS,
  # default_headers, clean_chunks, get_sep_count and sort_separators
  # (presumably provided by Source -- confirm against source.rb), plus
  # blank?/present?/symbolize_keys, which look like ActiveSupport extensions.
  class CSV < Source

    # @param data [Hash] import options. Keys read here: :headers_present,
    #   :headers, :column_separator, :record_separator, :compulsory_headers
    #   and :content (an object responding to #path and accepted by File.size).
    # @raise [Exceptions::EmptyFileImportError] when no preview lines exist.
    # @raise [ArgumentError] when parsing still fails after the file has been
    #   cleaned once.
    def initialize(data)
      @headers_present = data[:headers_present] # user has indicated headers are provided
      @headers = data[:headers]
      @column_separator = SEPARATORS[data[:column_separator].to_sym] unless data[:column_separator].nil?
      record_separator = data[:record_separator]
      @record_separator =
        if !record_separator.nil? && record_separator.length > 0
          SEPARATORS[record_separator.to_sym]
        else
          "\n"
        end
      @compulsory_headers = data[:compulsory_headers]
      @file = data[:content]
      # Empty-column removal is only requested for files under 100,000 bytes.
      @delete_empty_columns = File.size(@file) < 100_000
      cleaned = false
      begin
        first_line = get_first_line
        # get_first_line returns 0 when SmarterCSV yielded no chunk at all.
        raise ArgumentError if first_line == 0
        get_column_separator(first_line)
        @preview_lines = file_has_no_content
        if @headers.blank?
          @headers = @headers_present ? first_line.split(@column_separator) : default_headers(100)
        end
      rescue ArgumentError
        # Clean the file once and try again; a second failure is re-raised
        # instead of retrying without bound.
        raise if cleaned
        @file = clean_file(@file)
        cleaned = true
        retry
      end
    end

    # Reads the first line of the file as a single string by parsing with the
    # record separator standing in for a missing column separator.
    #
    # @return [String] the first header (headers present) or first value.
    #   When the block is never invoked, the return value of
    #   SmarterCSV.process itself is returned (#initialize checks for 0).
    # @raise [Exceptions::EmptyFileImportError] when SmarterCSV hits EOFError.
    def get_first_line
      options = default_options(
        :col_sep => @column_separator.present? ? @column_separator : "\n",
        :row_sep => @record_separator.nil? ? "\n" : @record_separator,
        :chunk_size => 8
      )
      SmarterCSV.process(@file.path, options) do |chunk|
        first_row = chunk.first
        return (@headers_present ? first_row.keys[0] : first_row.values[0]).to_s
      end
    rescue EOFError
      raise Exceptions::EmptyFileImportError.new
    end

    # Returns the preview lines, treating "no lines" as an empty file.
    #
    # @raise [Exceptions::EmptyFileImportError] when the preview is blank or
    #   0, or when building it raises NoMethodError.
    def file_has_no_content
      lines = get_preview_lines
      raise Exceptions::EmptyFileImportError.new if lines.blank? || lines == 0
      lines
    rescue NoMethodError
      raise Exceptions::EmptyFileImportError.new
    end

    # @return [String] the identifier of this importer type.
    def get_type
      "csv"
    end

    # @return the headers determined in #initialize.
    def get_headers
      @headers
    end

    # Returns the column separator, detecting and caching it on first use.
    # The record separator, when known, is excluded from the candidates.
    def get_column_separator(first_line = get_first_line)
      return @column_separator if !@column_separator.nil? && @column_separator.length > 0
      separators = get_sep_count(first_line)
      separators.reject! { |sep| sep.keys[0] == @record_separator } unless @record_separator.nil?
      @column_separator = sort_separators(separators)
    end

    # Returns the record separator, detecting and caching it on first use.
    # The column separator is excluded from the candidates.
    def get_record_separator(first_line = get_first_line)
      return @record_separator if !@record_separator.nil? && @record_separator.length > 0
      separators = get_sep_count(first_line)
      separators.reject! { |sep| sep.keys[0] == get_column_separator }
      @record_separator = sort_separators(separators)
    end

    # Returns cleaned lines for previewing. Cached preview lines are reused
    # when available; otherwise the file is parsed (50 rows per chunk when
    # empty columns are to be deleted, 8 otherwise) and at most eight lines
    # of the first chunk are returned.
    #
    # @raise [Exceptions::HeaderMismatchError] on SmarterCSV::HeaderSizeMismatch.
    def get_preview_lines
      unless @preview_lines.blank?
        return clean_chunks([@preview_lines], @compulsory_headers, @delete_empty_columns)[0].symbolize_keys[:lines]
      end
      row_sep = @record_separator.nil? ? "\n" : @record_separator
      if @delete_empty_columns
        chunks = SmarterCSV.process(@file.path, default_options(:row_sep => row_sep, :chunk_size => 50))
        return clean_chunks(chunks, @compulsory_headers, true)[0].symbolize_keys[:lines][0..7]
      end
      SmarterCSV.process(@file.path, default_options(:row_sep => row_sep, :chunk_size => 8)) do |chunk|
        return clean_chunks([chunk], @compulsory_headers)[0].symbolize_keys[:lines][0..7]
      end
    rescue SmarterCSV::HeaderSizeMismatch
      raise Exceptions::HeaderMismatchError.new
    end

    # this is horrendously slow
    # NOTE(review): the slice is applied to the array returned by
    # get_chunks(50), so start/number_of_lines appear to index chunks rather
    # than individual rows -- confirm this is intended.
    def get_lines(start, number_of_lines)
      get_chunks(50)[start..(start + number_of_lines)]
    end

    # Parses the whole file in chunks of chunk_size rows, applying the header
    # mapping, and passes the result through clean_chunks.
    #
    # @param chunk_size [Integer]
    # @raise [ArgumentError] when parsing still fails after the file has been
    #   cleaned once.
    def get_chunks(chunk_size)
      cleaned = false
      begin
        file_headers = SmarterCSV.process(@file.path, default_options).first.keys
        header_map = convert_headers(file_headers, @headers, @headers_present)
        options =
          if @headers_present
            {:chunk_size => chunk_size,
             :key_mapping => header_map.delete_if { |_key, value| value.blank? },
             :remove_unmapped_keys => true, :user_provided_headers => nil}
          else
            {:chunk_size => chunk_size, :user_provided_headers => header_map.values,
             :remove_empty_values => true}
          end
        chunks = SmarterCSV.process(@file.path, default_options(options))
        clean_chunks(chunks, @compulsory_headers, @delete_empty_columns)
      rescue ArgumentError
        # Clean the file once and try again; a second failure is re-raised
        # instead of retrying without bound.
        raise if cleaned
        @file = clean_file(@file)
        cleaned = true
        retry
      end
    end

    # Maps each original header to its replacement.
    #
    # @param provided_headers [Array] headers read from the file.
    # @param mapped_headers [#each] pairs of new key => column index; when
    #   several pairs name the same index, the last one wins.
    # @param headers_present [Boolean] when false, default_headers is used in
    #   place of provided_headers.
    # @return [Hash] old header => mapped key, or :column_<index> for
    #   positions without a mapping.
    def convert_headers(provided_headers, mapped_headers, headers_present)
      old_headers = headers_present ? provided_headers : default_headers
      # One pass over the mapping; later pairs overwrite earlier ones, which
      # preserves the last-match-wins result of scanning the mapping per column.
      index_to_key = {}
      mapped_headers.each { |new_key, value| index_to_key[value.to_s] = new_key }
      new_headers = old_headers.each_with_index.map do |_old_key, index|
        index_to_key.fetch(index.to_s) { "column_#{index}".to_sym }
      end
      Hash[old_headers.zip(new_headers)]
    end

    # Base SmarterCSV options for this file, overridden by +options+.
    #
    # TODO: fix quote_char
    # NOTE(review): the quote character is presumably one that is unlikely to
    # occur in real data, so that quoting is effectively disabled -- confirm.
    # bit of a hack here to provide the correct number of default headers to the user (rather than just 100)
    #
    # @param options [Hash] options merged over the defaults.
    # @return [Hash]
    def default_options(options = {})
      user_headers =
        if @headers_present
          @headers == nil || @headers == {} ? nil : @headers
        else
          default_headers(100)
        end
      {:col_sep => @column_separator, :row_sep => @record_separator, :quote_char => "‱", :remove_empty_values => false,
       :verbose => false, :headers_in_file => @headers_present, :convert_values_to_numeric => false,
       :user_provided_headers => user_headers}.merge(options)
    end

    # Writes a cleaned copy of +file+ to a new Tempfile: invalid byte
    # sequences are replaced with "?" (via a UTF-16 round trip) and line
    # endings are normalised to "\n".
    #
    # @param file [#path] the file to clean.
    # @return [Tempfile] the cleaned copy, already closed.
    def clean_file(file)
      # Read by path so the result does not depend on the handle's position
      # or on whether the handle (e.g. an earlier cleaned Tempfile) is closed.
      contents = File.read(file.path)
      scrubbed = contents.force_encoding('UTF-8').encode('UTF-16', :invalid => :replace, :replace => '?').encode('UTF-8')
      import = Tempfile.new(["import", ".xls"], :encoding => "UTF-8")
      # gsub, not gsub!: the bang form returns nil when nothing is replaced,
      # which would write an empty file for input that already uses "\n".
      import.write(scrubbed.gsub(/\r\n|\r/, "\n"))
      import.close
      import
    end
  end
end
@@ -0,0 +1,92 @@
1
module TableImporter

  # Imports tabular data from the first sheet of an .xls/.xlsx file via Roo.
  #
  # Depends on names that are not defined in this file: default_headers and
  # clean_chunks (presumably provided by Source -- confirm against
  # source.rb), plus blank?/present?, which look like ActiveSupport extensions.
  class Excel < Source

    # @param data [Hash] import options. Keys read here: :content (an object
    #   responding to #path), :headers_present, :compulsory_headers,
    #   :user_headers and :headers.
    # @raise [Exceptions::EmptyFileImportError] when the sheet has no first row.
    # @raise [Exceptions::HeaderMismatchError] when any NoMethodError is
    #   raised during setup (for example a header cell without #to_sym).
    # @raise [Exceptions::IncorrectFileError] see #get_file.
    def initialize(data)
      # Anything that is not ".xls" is treated as "xlsx".
      @type = File.extname(data[:content]) == ".xls" ? "xls" : "xlsx"
      @file_path = data[:content].path
      @headers_present = data[:headers_present]
      @file = get_file
      @compulsory_headers = data[:compulsory_headers]
      # Empty-column removal is only requested for files under 100,000 bytes.
      @delete_empty_columns = (File.size(@file_path) < 100_000)
      @mapping = data[:user_headers].blank? ? data[:headers] : data[:user_headers]
      raise Exceptions::EmptyFileImportError.new unless @file.first_row
      @headers =
        if !data[:headers].nil?
          data[:headers]
        elsif @headers_present
          @file.row(1).map { |header| header.to_sym }
        else
          default_headers
        end
    rescue NoMethodError
      raise Exceptions::HeaderMismatchError.new
    end

    # @return the headers currently in use.
    def get_headers
      @headers
    end

    # Opens the first sheet of the workbook with the Roo class matching @type.
    #
    # @raise [Exceptions::IncorrectFileError] when Roo raises TypeError.
    def get_file
      if @type == "xls"
        Roo::Excel.new(@file_path).sheet(0)
      elsif @type == "xlsx"
        Roo::Excelx.new(@file_path).sheet(0)
      end
    rescue TypeError
      raise Exceptions::IncorrectFileError.new
    end

    # @return [String] the identifier of this importer type; "xls" for both
    #   .xls and .xlsx files.
    def get_type
      "xls"
    end

    # Returns up to eight cleaned lines for previewing. Windows of rows are
    # examined, moving forward by ten each time, until one yields a line.
    #
    # @param start_point [Integer] first row number of the first window.
    # @param end_point [Integer] passed to #get_lines as the row count.
    # @raise [Exceptions::EmptyFileImportError] when no window yields a line.
    def get_preview_lines(start_point = 0, end_point = 10)
      loop do
        window = clean_chunks([get_lines(start_point, end_point)], @compulsory_headers)[0][:lines]
        break unless window.first.nil?
        # get_lines only reads rows below @last_row, so once the window starts
        # at or past it no later window can contain rows. Stop here rather
        # than searching on until the stack overflows.
        raise Exceptions::EmptyFileImportError.new if start_point >= @last_row
        start_point += 10
        end_point += 10
      end
      @headers = @mapping.present? ? convert_headers : @headers
      clean_chunks([get_lines(start_point + 1, end_point + 1)], @compulsory_headers, @delete_empty_columns)[0][:lines][0..7]
    end

    # Maps sheet rows onto @headers.
    #
    # @param start [Integer] first row number to read.
    # @param number_of_lines [Integer] maximum number of rows to read.
    # @return [Array<Hash>] one hash per row for rows start up to, but not
    #   including, min(last row, start + number_of_lines).
    def get_lines(start, number_of_lines)
      @last_row ||= @file.last_row
      finish = [@last_row, start + number_of_lines].min
      (start...finish).map do |row_number|
        Hash[@headers.zip(@file.row(row_number))]
      end
    end

    # Builds a header list of default headers in which the positions named by
    # @mapping are replaced with the mapped keys. Only mapping values that are
    # strings holding a plain integer (e.g. "3") are applied.
    #
    # @return the header list; plain default headers when there is no mapping.
    def convert_headers
      template = @headers_present ? @file.row(1) : default_headers
      new_headers = default_headers(template.count)
      # @mapping is nil when neither :user_headers nor :headers was supplied;
      # treat that as an empty mapping instead of failing on nil.each.
      (@mapping || {}).each do |key, value|
        new_headers[value.to_i] = key.to_sym if value.to_i.to_s == value
      end
      new_headers
    end

    # Re-derives @headers from the mapping, then returns all data rows in
    # chunks of chunk_size after passing them through clean_chunks.
    #
    # @param chunk_size [Integer]
    def get_chunks(chunk_size)
      @headers = convert_headers
      @last_row ||= @file.last_row
      chunks = []
      # Row 1 holds the headers when they are present in the sheet.
      start_point = @headers_present ? 2 : 1
      while chunks.count <= @last_row / chunk_size
        chunks << get_lines(start_point, chunk_size)
        start_point += chunk_size
      end
      # get_lines never includes the last row itself, so it is added here.
      chunks.last << Hash[@headers.zip(@file.row(@last_row))]
      clean_chunks(chunks, @compulsory_headers)
    end
  end
end