ndr_import 3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.gitignore +14 -0
- data/.rubocop.yml +27 -0
- data/.ruby-version +1 -0
- data/.travis.yml +22 -0
- data/CODE_OF_CONDUCT.md +13 -0
- data/Gemfile +4 -0
- data/Guardfile +16 -0
- data/LICENSE.txt +21 -0
- data/README.md +69 -0
- data/Rakefile +13 -0
- data/code_safety.yml +374 -0
- data/gemfiles/Gemfile.rails32 +5 -0
- data/gemfiles/Gemfile.rails32.lock +142 -0
- data/gemfiles/Gemfile.rails41 +5 -0
- data/gemfiles/Gemfile.rails41.lock +145 -0
- data/gemfiles/Gemfile.rails42 +5 -0
- data/gemfiles/Gemfile.rails42.lock +145 -0
- data/lib/ndr_import.rb +13 -0
- data/lib/ndr_import/csv_library.rb +40 -0
- data/lib/ndr_import/file/all.rb +8 -0
- data/lib/ndr_import/file/base.rb +76 -0
- data/lib/ndr_import/file/delimited.rb +86 -0
- data/lib/ndr_import/file/excel.rb +131 -0
- data/lib/ndr_import/file/pdf.rb +38 -0
- data/lib/ndr_import/file/registry.rb +50 -0
- data/lib/ndr_import/file/text.rb +52 -0
- data/lib/ndr_import/file/word.rb +30 -0
- data/lib/ndr_import/file/zip.rb +67 -0
- data/lib/ndr_import/helpers/file/delimited.rb +105 -0
- data/lib/ndr_import/helpers/file/excel.rb +181 -0
- data/lib/ndr_import/helpers/file/pdf.rb +29 -0
- data/lib/ndr_import/helpers/file/word.rb +27 -0
- data/lib/ndr_import/helpers/file/xml.rb +45 -0
- data/lib/ndr_import/helpers/file/zip.rb +44 -0
- data/lib/ndr_import/mapper.rb +220 -0
- data/lib/ndr_import/mapping_error.rb +5 -0
- data/lib/ndr_import/non_tabular/column_mapping.rb +73 -0
- data/lib/ndr_import/non_tabular/line.rb +46 -0
- data/lib/ndr_import/non_tabular/mapping.rb +35 -0
- data/lib/ndr_import/non_tabular/record.rb +99 -0
- data/lib/ndr_import/non_tabular/table.rb +193 -0
- data/lib/ndr_import/non_tabular_file_helper.rb +160 -0
- data/lib/ndr_import/standard_mappings.rb +23 -0
- data/lib/ndr_import/table.rb +179 -0
- data/lib/ndr_import/version.rb +4 -0
- data/ndr_import.gemspec +44 -0
- data/test/file/base_test.rb +54 -0
- data/test/file/delimited_test.rb +143 -0
- data/test/file/excel_test.rb +85 -0
- data/test/file/pdf_test.rb +35 -0
- data/test/file/registry_test.rb +60 -0
- data/test/file/text_test.rb +92 -0
- data/test/file/word_test.rb +35 -0
- data/test/file/zip_test.rb +47 -0
- data/test/helpers/file/delimited_test.rb +113 -0
- data/test/helpers/file/excel_test.rb +97 -0
- data/test/helpers/file/pdf_test.rb +26 -0
- data/test/helpers/file/word_test.rb +26 -0
- data/test/helpers/file/xml_test.rb +131 -0
- data/test/helpers/file/zip_test.rb +75 -0
- data/test/mapper_test.rb +551 -0
- data/test/non_tabular/mapping_test.rb +36 -0
- data/test/non_tabular/table_test.rb +510 -0
- data/test/non_tabular_file_helper_test.rb +501 -0
- data/test/readme_test.rb +53 -0
- data/test/resources/bomd.csv +3 -0
- data/test/resources/broken.csv +3 -0
- data/test/resources/filesystem_paths.yml +26 -0
- data/test/resources/flat_file.pdf +0 -0
- data/test/resources/flat_file.txt +27 -0
- data/test/resources/flat_file.yml +20 -0
- data/test/resources/hello_utf16be.txt +0 -0
- data/test/resources/hello_utf16le.txt +0 -0
- data/test/resources/hello_utf8.txt +2 -0
- data/test/resources/hello_windows.txt +2 -0
- data/test/resources/hello_world.doc +0 -0
- data/test/resources/hello_world.pdf +0 -0
- data/test/resources/hello_world.txt +2 -0
- data/test/resources/high_ascii_delimited.txt +2 -0
- data/test/resources/malformed.xml +6 -0
- data/test/resources/normal.csv +3 -0
- data/test/resources/normal.csv.zip +0 -0
- data/test/resources/normal_pipe.csv +3 -0
- data/test/resources/normal_thorn.csv +3 -0
- data/test/resources/not_a_pdf.pdf +0 -0
- data/test/resources/not_a_word_file.doc +0 -0
- data/test/resources/sample_xls.xls +0 -0
- data/test/resources/sample_xlsx.xlsx +0 -0
- data/test/resources/standard_mappings.yml +39 -0
- data/test/resources/txt_file_xls_extension.xls +1 -0
- data/test/resources/txt_file_xlsx_extension.xlsx +1 -0
- data/test/resources/utf-16be_xml.xml +0 -0
- data/test/resources/utf-16be_xml_with_declaration.xml +0 -0
- data/test/resources/utf-16le_xml.xml +0 -0
- data/test/resources/utf-8_xml.xml +9 -0
- data/test/resources/windows-1252_xml.xml +9 -0
- data/test/resources/windows.csv +5 -0
- data/test/resources/xlsx_file_xls_extension.xls +0 -0
- data/test/standard_mappings_test.rb +22 -0
- data/test/table_test.rb +288 -0
- data/test/test_helper.rb +13 -0
- metadata +443 -0
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'ndr_support/safe_file'
|
2
|
+
require 'ndr_support/utf8_encoding'
|
3
|
+
require_relative 'registry'
|
4
|
+
|
5
|
+
module NdrImport
|
6
|
+
# This is one of a collection of file handlers that deal with individual formats of data.
|
7
|
+
# They can be instantiated directly or via the factory method Registry.tables
|
8
|
+
module File
|
9
|
+
# This class is a text file handler that returns a single table.
|
10
|
+
class Text < Base
|
11
|
+
include UTF8Encoding
|
12
|
+
|
13
|
+
private
|
14
|
+
|
15
|
+
# Streams the text file at @filename line by line. Each line is passed
# through ensure_utf8! and has its trailing line terminator removed before
# being handed to the block.
#
# Returns an Enumerator over the same lines when no block is given.
# Raises a RuntimeError naming the file (basename only) and the underlying
# error if anything goes wrong while resolving, opening or reading the file.
def rows(&block)
  return enum_for(:rows) unless block

  # Encoding:
  # As we're going to be yielding the lines of the file as it is streamed
  # (rather than slurped in advance), we need to know which encoding / mode
  # is going to work in advance.
  #
  path = SafeFile.safepath_to_string(@filename)
  mode = read_mode_for(path)

  # SECURE: TG 13 Oct 2015 SafeFile.safepath_to_string ensures that the path is SafePath.
  # The block form of File.open closes the handle as soon as iteration ends,
  # including when the consumer's block raises part-way through the file.
  ::File.open(path, mode) do |file|
    file.each { |line| block.call ensure_utf8!(line).chomp }
  end
rescue => e
  raise "Failed to read #{SafeFile.basename(@filename)} as text [#{e.class}: #{e.message}]"
end
|
31
|
+
|
32
|
+
# TODO: In Ruby 2.0+, a mode of "rb:bom|utf-16:utf-8" seemed to fix all cases,
|
33
|
+
# but this doesn't work on Ruby 1.9.3, which we are currently still supporting.
|
34
|
+
# Therefore, we have to test multiple modes in advance, hence #read_mode_for.
|
35
|
+
# Works out which File read mode can stream the whole of trusted_path.
#
# Each candidate mode is tried in order by reading the file from start to
# finish; a mode is discarded when the read raises
# Encoding::InvalidByteSequenceError. Returns the first mode that survives,
# or raises a RuntimeError if every candidate was discarded.
#
# trusted_path must already be a plain path string that the caller has
# validated; no SafeFile check is performed here.
def read_mode_for(trusted_path)
  # These are the read modes we will try, in order:
  modes = ['rb:utf-16:utf-8', 'r:utf-8']

  begin
    # Block form of File.open, so that every probe closes its handle rather
    # than leaving it open until garbage collection.
    ::File.open(trusted_path, modes.first) { |file| file.each { |_line| } }
  rescue Encoding::InvalidByteSequenceError
    modes.shift # That one didn't work...
    retry if modes.any? # bounded: at most one retry per remaining mode
  end

  modes.first || fail('Unable to determine working stream encoding!')
end
|
48
|
+
end
|
49
|
+
|
50
|
+
Registry.register(Text, 'txt') # TODO: Add 'nontabular'?
|
51
|
+
end
|
52
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'msworddoc-extractor'
|
2
|
+
require 'ndr_support/safe_file'
|
3
|
+
require_relative 'registry'
|
4
|
+
|
5
|
+
module NdrImport
|
6
|
+
# This is one of a collection of file handlers that deal with individual formats of data.
|
7
|
+
# They can be instantiated directly or via the factory method Registry.tables
|
8
|
+
module File
|
9
|
+
# This class is a Word document file handler that returns a single table.
|
10
|
+
# Currently only works on .doc (97-2003) files, not .docx.
|
11
|
+
class Word < Base
|
12
|
+
private
|
13
|
+
|
14
|
+
# Loads the Word document at @filename and hands its text to the block one
# line at a time (the whole contents split on "\n").
#
# Returns an Enumerator when no block is given. Any StandardError is
# re-raised as a RuntimeError carrying the file's basename and the original
# error class and message.
def rows(&block)
  return enum_for(:rows) if block.nil?

  word_path = SafeFile.safepath_to_string(@filename)
  document  = MSWordDoc::Extractor.load(word_path)

  document.whole_contents.split("\n").each { |text_line| block.call(text_line) }
rescue => error
  raise("#{SafeFile.basename(@filename)} [#{error.class}: #{error.message}]")
end
|
26
|
+
end
|
27
|
+
|
28
|
+
Registry.register(Word, 'doc') # TODO: Add 'word'?
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'zip'
|
2
|
+
require 'ndr_support/safe_file'
|
3
|
+
require_relative 'registry'
|
4
|
+
|
5
|
+
module NdrImport
|
6
|
+
# This is one of a collection of file handlers that deal with individual formats of data.
|
7
|
+
# They can be instantiated directly or via the factory method Registry.tables
|
8
|
+
module File
|
9
|
+
# This class is a zip file handler that returns tables from the extracted files.
|
10
|
+
class Zip < Base
|
11
|
+
# Builds a zip handler. After the superclass initialiser has run, two
# options are read:
#   'pattern'    - matched against each entry's basename; defaults to an
#                  empty regexp, which matches every name.
#   'unzip_path' - directory under which archives are unpacked; it is
#                  validated immediately via validate_unzip_path_is_safe!.
def initialize(filename, format, options = {})
  super

  @unzip_path = options['unzip_path']
  @pattern    = options['pattern'] || //

  validate_unzip_path_is_safe!
end
|
18
|
+
|
19
|
+
# Unpacks the archive and yields whatever unzip_entries produces for each
# extracted entry.
#
# Entries are extracted into a new sub-directory of @unzip_path named after
# the current time (HHMMSS followed by milliseconds). Raises when running
# under Rails in an "external" environment; returns an Enumerator when no
# block is given.
def files(&block)
  external = defined?(::Rails) && ::Rails.env.external?
  raise 'Not allowed in external environment' if external

  return enum_for(:files) if block.nil?

  timestamp   = Time.current.strftime('%H%M%S%L')
  destination = @unzip_path.join(timestamp)
  FileUtils.mkdir_p(SafeFile.safepath_to_string(destination))

  archive_path = SafeFile.safepath_to_string(@filename)
  ::Zip::File.open(archive_path) { |zipfile| unzip_entries(zipfile, destination, &block) }
end
|
31
|
+
|
32
|
+
# Zip files produce files, never tables.
|
33
|
+
# A zip archive only ever yields files (see #files); asking it for tables is
# a programming error, so this always raises.
def tables
  raise 'Zip#tables should never be called'
end
|
36
|
+
|
37
|
+
private
|
38
|
+
|
39
|
+
# Unzip the zip file entry and enumerate over it
|
40
|
+
# Extracts every regular-file entry whose basename matches @pattern into
# destination, then passes the extracted path to unzipped_files along with
# the caller's block. Other entries are skipped.
def unzip_entries(zipfile, destination, &block)
  zipfile.entries.each do |zip_entry|
    # SECURE: TPG 2010-11-1: The path is stripped from the zipfile entry when extracted
    flat_name = ::File.basename(zip_entry.name)

    next unless zip_entry.file?
    next unless flat_name.match(@pattern)

    target = destination.join(flat_name)
    zipfile.extract(zip_entry, target)

    unzipped_files(target, &block)
  end
end
|
52
|
+
|
53
|
+
# Enumerate over an unzipped file like any other
|
54
|
+
# Asks the Registry for the files contained in an extracted entry (passing
# on this handler's @options) and hands each one to the block.
def unzipped_files(unzipped_filename, &block)
  extracted = Registry.files(unzipped_filename, @options)
  extracted.each { |file| block.call(file) }
end
|
59
|
+
|
60
|
+
# Runs @unzip_path through SafeFile.safepath_to_string purely for its
# validation: whatever that call raises for an unacceptable path propagates
# to the caller.
def validate_unzip_path_is_safe!
  unzip_path = @unzip_path
  SafeFile.safepath_to_string(unzip_path)
end
|
63
|
+
end
|
64
|
+
|
65
|
+
Registry.register(Zip, 'zip')
|
66
|
+
end
|
67
|
+
end
|
@@ -0,0 +1,105 @@
|
|
1
|
+
require 'ndr_support/safe_file'
|
2
|
+
require 'ndr_import/csv_library'
|
3
|
+
|
4
|
+
module NdrImport
|
5
|
+
module Helpers
|
6
|
+
module File
|
7
|
+
# This mixin adds delimited file functionality to unified importers.
|
8
|
+
module Delimited
|
9
|
+
# Read a plain text CSV file, return an array of the content
|
10
|
+
# Reads a comma-separated file and returns all of its rows as an array.
# Thin wrapper over read_delimited_file using the default separator.
def read_csv_file(path)
  # If a CSV generated on Mac OS fails with "CSV::IllegalFormatError", see:
  # http://stackoverflow.com/questions/1549139/ruby-cannot-parse-excel-file-exported-as-csv-in-os-x
  csv_rows = read_delimited_file(path)
  csv_rows
end
|
17
|
+
|
18
|
+
# Slurp the entire file into an array of lines.
|
19
|
+
# Reads the whole delimited file into memory and returns its rows as an
# array, using delimited_rows to do the parsing.
def read_delimited_file(path, col_sep = nil)
  row_enumerator = delimited_rows(path, col_sep)
  row_enumerator.to_a
end
|
22
|
+
|
23
|
+
# Iterate through the file table by table, yielding each one in turn.
|
24
|
+
# A delimited file holds exactly one, unnamed table: yields nil as the table
# name together with the row enumerator from delimited_rows. Returns an
# Enumerator when called without a block.
def delimited_tables(path, col_sep = nil)
  if block_given?
    yield nil, delimited_rows(path, col_sep)
  else
    enum_for(:delimited_tables, path, col_sep)
  end
end
|
29
|
+
|
30
|
+
# Deprecated method
|
31
|
+
# Legacy name for delimited_tables: prints a warning and then delegates with
# the same arguments and block.
def each_delimited_table(path, col_sep = nil, &block)
  notice = '[warning] each_delimited_table will be deprecated,' \
           ' please use delimited_tables instead.'
  Kernel.warn notice

  delimited_tables(path, col_sep, &block)
end
|
36
|
+
|
37
|
+
# Iterate through the file line by line, yielding each one in turn.
|
38
|
+
# Streams the delimited file row by row, yielding each row as an array of
# strings (every cell is converted with to_s, so nil cells become '').
#
# NOTE: rows with five or fewer cells are skipped and never yielded.
#
# Returns an Enumerator when called without a block.
def delimited_rows(path, col_sep = nil)
  return enum_for(:delimited_rows, path, col_sep) unless block_given?

  trusted_path = SafeFile.safepath_to_string(path)
  csv_options  = determine_encodings!(trusted_path, col_sep)

  # By now, we know `csv_options` should let us read the whole
  # file successfully; if there are problems, we should crash.
  CSVLibrary.foreach(trusted_path, csv_options) do |cells|
    next if cells.length <= 5

    yield cells.map(&:to_s)
  end
end
|
50
|
+
|
51
|
+
# Deprecated method
|
52
|
+
# Legacy name for delimited_rows: prints a warning and then delegates with
# the same arguments and block.
def each_delimited_row(path, col_sep = nil, &block)
  notice = '[warning] each_delimited_row will be deprecated,' \
           ' please use delimited_rows instead.'
  Kernel.warn notice

  delimited_rows(path, col_sep, &block)
end
|
57
|
+
|
58
|
+
private
|
59
|
+
|
60
|
+
# Derive the source encoding by trying all supported encodings.
|
61
|
+
# Returns first set of working options, or raises if none could be found.
|
62
|
+
# Derives the options needed to read safe_path by reading the whole file
# once with each supported encoding, in order, and keeping the first set of
# options that gets to the end of the file.
#
# Arguments:
#   * safe_path - path string already validated by the caller
#   * col_sep   - column separator; a comma is used when nil. The caller's
#                 string is never modified (it may be frozen).
#
# Returns a hash with :col_sep (tagged with the matching encoding) and
# :encoding (the encoding string handed to CSVLibrary).
#
# Raises CSVLibrary::MalformedCSVError, with the row number and file
# basename added, if the file is not well-formed; re-raises any
# ArgumentError that is not an "invalid byte sequence" complaint; raises a
# RuntimeError if no supported encoding worked.
def determine_encodings!(safe_path, col_sep = nil)
  # delimiter encoding => CSV library encoding string
  supported_encodings = {
    'UTF-8'        => 'bom|utf-8',
    'Windows-1252' => 'windows-1252:utf-8'
  }

  successful_options = nil
  supported_encodings.each do |delimiter_encoding, csv_encoding|
    begin
      options = {
        # force_encoding changes its receiver in place, so work on a copy:
        # otherwise the caller's separator would be re-tagged on every
        # attempt, and a frozen separator would raise.
        :col_sep  => (col_sep || ',').dup.force_encoding(delimiter_encoding),
        :encoding => csv_encoding
      }

      row_num = 0
      # Iterate through the file; if we reach the end, this encoding worked:
      CSVLibrary.foreach(safe_path, options) { |_line| row_num += 1 }
    rescue ArgumentError => e
      next if e.message =~ /invalid byte sequence/ # This encoding didn't work
      raise(e)
    rescue CSVLibrary::MalformedCSVError => e
      description = (col_sep ? col_sep.inspect + ' delimited' : 'CSV')

      raise(CSVLibrary::MalformedCSVError, "Invalid #{description} format " \
        "on row #{row_num + 1} of #{::File.basename(safe_path)}. Original: #{e.message}")
    end

    # We got this far => encoding choice worked:
    successful_options = options
    break
  end

  # We tried them all, and none worked:
  unless successful_options
    fail "None of the encodings #{supported_encodings.values.inspect} were successful!"
  end

  successful_options
end
|
102
|
+
end
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
@@ -0,0 +1,181 @@
|
|
1
|
+
require 'ndr_support/safe_file'
|
2
|
+
|
3
|
+
module NdrImport
|
4
|
+
module Helpers
|
5
|
+
module File
|
6
|
+
# This mixin adds excel spreadsheet functionality to unified importers.
|
7
|
+
# It provides a file reader method and methods to cast raw values
|
8
|
+
# appropriately. These methods can be overridden or aliased as required.
|
9
|
+
#
|
10
|
+
module Excel
|
11
|
+
require 'roo'
|
12
|
+
require 'roo-xls'
|
13
|
+
require 'ole/storage'
|
14
|
+
|
15
|
+
protected
|
16
|
+
|
17
|
+
# Converts a raw spreadsheet cell value into the form used by importers:
#   * nil stays nil
#   * Date / DateTime / Time values go through cast_excel_datetime_as_date
#   * Floats become strings, dropping the fractional part when it is zero
#     (1.0 => "1", 1.5 => "1.5")
#   * anything else becomes its to_s with surrounding whitespace stripped
def cast_excel_value(raw_value)
  case raw_value
  when nil
    raw_value
  when Date, DateTime, Time
    cast_excel_datetime_as_date(raw_value)
  when Float
    whole = raw_value.to_i
    # Whole numbers are rendered without a trailing ".0"
    raw_value.to_f == whole ? whole.to_s : raw_value.to_f.to_s
  else
    raw_value.to_s.strip
  end
end
|
33
|
+
|
34
|
+
# Renders a date/time value by calling to_s with the :db format name.
# NOTE(review): to_s(:db) is not core Ruby; this presumably relies on
# ActiveSupport's date/time formats being loaded - confirm.
def cast_excel_datetime_as_date(raw_value)
  db_format = :db
  raw_value.to_s(db_format)
end
|
37
|
+
|
38
|
+
# Iterate through the file table by table, yielding each one in turn.
|
39
|
+
# Walks the workbook at path sheet by sheet, yielding each sheet's name
# together with the row enumerator from excel_rows. Returns an Enumerator
# when called without a block.
def excel_tables(path)
  if block_given?
    book = load_workbook(path)
    book.each_with_pagename { |sheet_name, sheet| yield sheet_name, excel_rows(book, sheet) }
  else
    enum_for(:excel_tables, path)
  end
end
|
47
|
+
|
48
|
+
# Deprecated method
|
49
|
+
# Legacy name for excel_tables: prints a warning and then delegates with the
# same argument and block.
def each_excel_table(path, &block)
  notice = '[warning] each_excel_table will be deprecated,' \
           ' please use excel_tables instead.'
  Kernel.warn notice

  excel_tables(path, &block)
end
|
54
|
+
|
55
|
+
private
|
56
|
+
|
57
|
+
# Reads one worksheet of the workbook at path into a nested array of rows.
#
# selected_sheet names the worksheet to read; when it is nil, or the
# workbook has no sheet of that name, the first worksheet is used instead.
def read_excel_file(path, selected_sheet = nil)
  # SECURE: TVB Mon Aug 13 15:30:32 BST 2012 SafeFile.safepath_to_string makes sure that
  # the path is SafePath.
  workbook = load_workbook(path)

  # Use the requested worksheet only if one was given and it exists.
  wanted = selected_sheet.to_s
  found  = !selected_sheet.nil? && workbook.sheets.include?(wanted)
  workbook.default_sheet = found ? wanted : workbook.sheets.first

  # The workbook doubles as the sheet here: rows are read from its default sheet.
  excel_rows(workbook, workbook).to_a
end
|
75
|
+
|
76
|
+
# Iterate through the sheet line by line, yielding each one in turn.
|
77
|
+
# Iterates through the sheet row by row, yielding each row in turn. Returns
# an Enumerator when called without a block.
#
# Every workbook type is currently read with xls_rows, so the workbook
# argument does not influence the result; it is kept so that callers and a
# future format-specific reader are unaffected.
def excel_rows(workbook, sheet, &block)
  return enum_for(:excel_rows, workbook, sheet) unless block

  # FIXME: for Roo::Excelx workbooks, xlsx_rows(sheet, &block) should be used
  #        once it produces the same output as xls_rows.
  xls_rows(sheet, &block)
end
|
87
|
+
|
88
|
+
# Deprecated method
|
89
|
+
# Legacy name for excel_rows: prints a warning and then delegates with the
# same arguments and block.
def each_excel_row(workbook, sheet, &block)
  notice = '[warning] each_excel_row will be deprecated,' \
           ' please use excel_rows instead.'
  Kernel.warn notice

  excel_rows(workbook, sheet, &block)
end
|
94
|
+
|
95
|
+
# Iterate through an xls sheet line by line, yielding each one in turn.
|
96
|
+
# Walks the sheet from its first to its last row, yielding one array per
# row. Each array holds the cells from the sheet's first to last column,
# converted with cast_excel_value. Returns an Enumerator when called without
# a block.
def xls_rows(sheet)
  return enum_for(:xls_rows, sheet) unless block_given?

  sheet.first_row.upto(sheet.last_row) do |row_index|
    columns = sheet.first_column.upto(sheet.last_column)
    yield columns.map { |col_index| cast_excel_value(sheet.cell(row_index, col_index)) }
  end
end
|
107
|
+
|
108
|
+
# Deprecated method
|
109
|
+
# Legacy name for xls_rows: prints a warning and then delegates with the
# same argument and block.
def each_xls_row(sheet, &block)
  notice = '[warning] each_xls_row will be deprecated,' \
           ' please use xls_rows instead.'
  Kernel.warn notice

  xls_rows(sheet, &block)
end
|
114
|
+
|
115
|
+
# Iterate through an xlsx sheet line by line, yielding each one in turn.
|
116
|
+
# This method uses streaming https://github.com/roo-rb/roo#excel-xlsx-and-xlsm-support
|
117
|
+
# Iterates through an xlsx sheet using the sheet's each_row_streaming
# (requested with :pad_cells => true), yielding one array per row with every
# cell's value converted by cast_excel_value. Returns an Enumerator when
# called without a block.
# See https://github.com/roo-rb/roo#excel-xlsx-and-xlsm-support
def xlsx_rows(sheet)
  if block_given?
    sheet.each_row_streaming(:pad_cells => true) do |cells|
      converted = cells.map { |cell| cast_excel_value(cell.value) }
      yield converted
    end
  else
    enum_for(:xlsx_rows, sheet)
  end
end
|
124
|
+
|
125
|
+
# Deprecated method
|
126
|
+
# Legacy name for xlsx_rows: prints a warning and then delegates with the
# same argument and block.
def each_xlsx_row(sheet, &block)
  notice = '[warning] each_xlsx_row will be deprecated,' \
           ' please use xlsx_rows instead.'
  Kernel.warn notice

  xlsx_rows(sheet, &block)
end
|
131
|
+
|
132
|
+
# Returns the workbook's sheets collection (as reported by workbook.sheets)
# for the spreadsheet at path.
def get_excel_sheets_name(path)
  load_workbook(path).sheets
end
|
136
|
+
|
137
|
+
# Opens the spreadsheet at path with the Roo reader matching its extension
# (compared case-insensitively): Roo::Excel for .xls, Roo::Excelx for .xlsx.
#
# If an .xls file turns out not to be in the OLE format, it is assumed to be
# a mislabelled .xlsx file: it is copied alongside the original as
# "<name>_amend.xlsx" and that copy is loaded instead. The copy is not
# removed afterwards.
#
# Raises a RuntimeError for any other extension or read failure.
def load_workbook(path)
  case SafeFile.extname(path).downcase
  when '.xls'
    Roo::Excel.new(SafeFile.safepath_to_string(path))
  when '.xlsx'
    Roo::Excelx.new(SafeFile.safepath_to_string(path))
  else
    fail "Received file path with unexpected extension #{SafeFile.extname(path)}"
  end
rescue Ole::Storage::FormatError => e
  # TODO: Do we need to remove the new_file after using it?

  # Try to load the .xls file as an .xlsx file, useful for sources like USOM.
  # Roo checks file extensions in file_type_check (GenericSpreadsheet),
  # so we create a duplicate file with an xlsx extension.
  #
  # The extension test is case-insensitive to agree with the dispatch above,
  # which downcases the extension before choosing a reader.
  if /\.xls\z/i.match(path)
    new_file_name = SafeFile.basename(path).sub(/\.xls\z/i, '_amend.xlsx')
    new_file_path = SafeFile.dirname(path).join(new_file_name)
    copy_file(path, new_file_path)

    load_workbook(new_file_path)
  else
    raise e.message
  end
rescue => e
  raise ["Unable to read the file '#{path}'", e.message].join('; ')
end
|
164
|
+
|
165
|
+
# Note that this method can produce insecure calls. All callers must protect
|
166
|
+
# their arguments.
|
167
|
+
# Arguments:
|
168
|
+
# * source - SafeFile
|
169
|
+
# * dest - SafeFile
|
170
|
+
#
|
171
|
+
# Copies source to dest, first creating dest's directory (and any missing
# parents).
#
# SECURE: TVB Mon Aug 13 13:53:02 BST 2012 : both arguments are converted
# with SafeFile.safepath_to_string before the filesystem is touched, so
# SafeFile is relied upon to check that they really are SafePaths.
def copy_file(source, dest)
  target_dir = SafeFile.safepath_to_string(SafeFile.dirname(dest))
  FileUtils.mkdir_p(target_dir)

  from = SafeFile.safepath_to_string(source)
  to   = SafeFile.safepath_to_string(dest)
  FileUtils.cp(from, to)
end
|
178
|
+
end
|
179
|
+
end
|
180
|
+
end
|
181
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'ndr_support/safe_file'
|
2
|
+
|
3
|
+
module NdrImport
|
4
|
+
module Helpers
|
5
|
+
module File
|
6
|
+
# This mixin adds PDF functionality to unified importers. It provides a file reader method.
|
7
|
+
module Pdf
|
8
|
+
private
|
9
|
+
|
10
|
+
# Reads the PDF at path and returns the text of all its pages as one flat
# array of lines (each page's text split on "\n", in page order).
#
# Any StandardError is re-raised as a RuntimeError naming the page being
# processed when it happened (page 1 if the file could not be opened at
# all), the file's basename, and the original error class and message.
def read_pdf_file(path)
  require 'pdf-reader' # loaded on first use rather than at file load
  pages_done = 0
  text_lines = []

  begin
    document = PDF::Reader.new(SafeFile.safepath_to_string(path))

    document.pages.each do |page|
      text_lines.concat(page.text.split("\n"))
      pages_done += 1
    end
  rescue => e
    raise("Invalid format on page #{pages_done + 1} of #{SafeFile.basename(path)} [#{e.class}: #{e.message}]")
  end

  text_lines
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|