ndr_import 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.gitignore +14 -0
- data/.rubocop.yml +27 -0
- data/.ruby-version +1 -0
- data/.travis.yml +22 -0
- data/CODE_OF_CONDUCT.md +13 -0
- data/Gemfile +4 -0
- data/Guardfile +16 -0
- data/LICENSE.txt +21 -0
- data/README.md +69 -0
- data/Rakefile +13 -0
- data/code_safety.yml +374 -0
- data/gemfiles/Gemfile.rails32 +5 -0
- data/gemfiles/Gemfile.rails32.lock +142 -0
- data/gemfiles/Gemfile.rails41 +5 -0
- data/gemfiles/Gemfile.rails41.lock +145 -0
- data/gemfiles/Gemfile.rails42 +5 -0
- data/gemfiles/Gemfile.rails42.lock +145 -0
- data/lib/ndr_import.rb +13 -0
- data/lib/ndr_import/csv_library.rb +40 -0
- data/lib/ndr_import/file/all.rb +8 -0
- data/lib/ndr_import/file/base.rb +76 -0
- data/lib/ndr_import/file/delimited.rb +86 -0
- data/lib/ndr_import/file/excel.rb +131 -0
- data/lib/ndr_import/file/pdf.rb +38 -0
- data/lib/ndr_import/file/registry.rb +50 -0
- data/lib/ndr_import/file/text.rb +52 -0
- data/lib/ndr_import/file/word.rb +30 -0
- data/lib/ndr_import/file/zip.rb +67 -0
- data/lib/ndr_import/helpers/file/delimited.rb +105 -0
- data/lib/ndr_import/helpers/file/excel.rb +181 -0
- data/lib/ndr_import/helpers/file/pdf.rb +29 -0
- data/lib/ndr_import/helpers/file/word.rb +27 -0
- data/lib/ndr_import/helpers/file/xml.rb +45 -0
- data/lib/ndr_import/helpers/file/zip.rb +44 -0
- data/lib/ndr_import/mapper.rb +220 -0
- data/lib/ndr_import/mapping_error.rb +5 -0
- data/lib/ndr_import/non_tabular/column_mapping.rb +73 -0
- data/lib/ndr_import/non_tabular/line.rb +46 -0
- data/lib/ndr_import/non_tabular/mapping.rb +35 -0
- data/lib/ndr_import/non_tabular/record.rb +99 -0
- data/lib/ndr_import/non_tabular/table.rb +193 -0
- data/lib/ndr_import/non_tabular_file_helper.rb +160 -0
- data/lib/ndr_import/standard_mappings.rb +23 -0
- data/lib/ndr_import/table.rb +179 -0
- data/lib/ndr_import/version.rb +4 -0
- data/ndr_import.gemspec +44 -0
- data/test/file/base_test.rb +54 -0
- data/test/file/delimited_test.rb +143 -0
- data/test/file/excel_test.rb +85 -0
- data/test/file/pdf_test.rb +35 -0
- data/test/file/registry_test.rb +60 -0
- data/test/file/text_test.rb +92 -0
- data/test/file/word_test.rb +35 -0
- data/test/file/zip_test.rb +47 -0
- data/test/helpers/file/delimited_test.rb +113 -0
- data/test/helpers/file/excel_test.rb +97 -0
- data/test/helpers/file/pdf_test.rb +26 -0
- data/test/helpers/file/word_test.rb +26 -0
- data/test/helpers/file/xml_test.rb +131 -0
- data/test/helpers/file/zip_test.rb +75 -0
- data/test/mapper_test.rb +551 -0
- data/test/non_tabular/mapping_test.rb +36 -0
- data/test/non_tabular/table_test.rb +510 -0
- data/test/non_tabular_file_helper_test.rb +501 -0
- data/test/readme_test.rb +53 -0
- data/test/resources/bomd.csv +3 -0
- data/test/resources/broken.csv +3 -0
- data/test/resources/filesystem_paths.yml +26 -0
- data/test/resources/flat_file.pdf +0 -0
- data/test/resources/flat_file.txt +27 -0
- data/test/resources/flat_file.yml +20 -0
- data/test/resources/hello_utf16be.txt +0 -0
- data/test/resources/hello_utf16le.txt +0 -0
- data/test/resources/hello_utf8.txt +2 -0
- data/test/resources/hello_windows.txt +2 -0
- data/test/resources/hello_world.doc +0 -0
- data/test/resources/hello_world.pdf +0 -0
- data/test/resources/hello_world.txt +2 -0
- data/test/resources/high_ascii_delimited.txt +2 -0
- data/test/resources/malformed.xml +6 -0
- data/test/resources/normal.csv +3 -0
- data/test/resources/normal.csv.zip +0 -0
- data/test/resources/normal_pipe.csv +3 -0
- data/test/resources/normal_thorn.csv +3 -0
- data/test/resources/not_a_pdf.pdf +0 -0
- data/test/resources/not_a_word_file.doc +0 -0
- data/test/resources/sample_xls.xls +0 -0
- data/test/resources/sample_xlsx.xlsx +0 -0
- data/test/resources/standard_mappings.yml +39 -0
- data/test/resources/txt_file_xls_extension.xls +1 -0
- data/test/resources/txt_file_xlsx_extension.xlsx +1 -0
- data/test/resources/utf-16be_xml.xml +0 -0
- data/test/resources/utf-16be_xml_with_declaration.xml +0 -0
- data/test/resources/utf-16le_xml.xml +0 -0
- data/test/resources/utf-8_xml.xml +9 -0
- data/test/resources/windows-1252_xml.xml +9 -0
- data/test/resources/windows.csv +5 -0
- data/test/resources/xlsx_file_xls_extension.xls +0 -0
- data/test/standard_mappings_test.rb +22 -0
- data/test/table_test.rb +288 -0
- data/test/test_helper.rb +13 -0
- metadata +443 -0
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
require 'ndr_support/safe_file'
|
|
2
|
+
require 'ndr_support/utf8_encoding'
|
|
3
|
+
require_relative 'registry'
|
|
4
|
+
|
|
5
|
+
module NdrImport
|
|
6
|
+
# This is one of a collection of file handlers that deal with individual formats of data.
|
|
7
|
+
# They can be instantiated directly or via the factory method Registry.tables
|
|
8
|
+
module File
|
|
9
|
+
# This class is a text file handler that returns a single table.
|
|
10
|
+
class Text < Base
|
|
11
|
+
include UTF8Encoding
|
|
12
|
+
|
|
13
|
+
private
|
|
14
|
+
|
|
15
|
+
# Streams the text file at @filename line by line.
#
# Without a block, returns an enumerator over the same lines. With a
# block, each line is passed through ensure_utf8! (presumably supplied
# by the included UTF8Encoding mixin - confirm) and chomped before
# being handed to the block.
#
# Any StandardError raised while reading (including one raised by the
# caller's block) is re-raised as a RuntimeError naming the file.
def rows(&block)
  return enum_for(:rows) unless block

  # Encoding:
  # As we're going to be yielding the lines of the file as it is streamed
  # (rather than slurped in advance), we need to know which encoding / mode
  # is going to work in advance.
  path = SafeFile.safepath_to_string(@filename)
  mode = read_mode_for(path)

  # SECURE: TG 13 Oct 2015 SafeFile.safepath_to_string ensures that the path is SafePath.
  # The block form of File.open guarantees the handle is closed, even if
  # reading or the caller's block raises part-way through the file.
  ::File.open(path, mode) do |file|
    file.each { |line| block.call ensure_utf8!(line).chomp }
  end
rescue => e
  raise "Failed to read #{SafeFile.basename(@filename)} as text [#{e.class}: #{e.message}]"
end
|
|
31
|
+
|
|
32
|
+
# TODO: In Ruby 2.0+, a mode of "rb:bom|utf-16:utf-8" seemed to fix all cases,
|
|
33
|
+
# but this doesn't work on Ruby 1.9.3, which we are currently still supporting.
|
|
34
|
+
# Therefore, we have to test multiple modes in advance, hence #read_mode_for.
|
|
35
|
+
# Works out which File open mode can read the whole of `trusted_path`.
#
# Each candidate mode is tried in order by reading the file to the end;
# a mode is rejected when reading raises
# Encoding::InvalidByteSequenceError. Returns the first mode that
# survives, or raises a RuntimeError if every candidate was rejected.
#
# `trusted_path` is passed straight to File.open, so callers must have
# validated it already.
def read_mode_for(trusted_path)
  # These are the read modes we will try, in order:
  modes = ['rb:utf-16:utf-8', 'r:utf-8']

  begin
    # Block form, so the probe handle is closed whether or not the
    # read succeeds (previously every attempt left a File open).
    ::File.open(trusted_path, modes.first) { |file| file.each { |_line| } }
  rescue Encoding::InvalidByteSequenceError
    modes.shift # That one didn't work...
    retry if modes.any?
  end

  modes.first || fail('Unable to determine working stream encoding!')
end
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
Registry.register(Text, 'txt') # TODO: Add 'nontabular'?
|
|
51
|
+
end
|
|
52
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require 'msworddoc-extractor'
|
|
2
|
+
require 'ndr_support/safe_file'
|
|
3
|
+
require_relative 'registry'
|
|
4
|
+
|
|
5
|
+
module NdrImport
|
|
6
|
+
# This is one of a collection of file handlers that deal with individual formats of data.
|
|
7
|
+
# They can be instantiated directly or via the factory method Registry.tables
|
|
8
|
+
module File
|
|
9
|
+
# This class is a Word document file handler that returns a single table.
|
|
10
|
+
# It currently only works on .doc (97-2003) files, not .docx.
|
|
11
|
+
class Word < Base
|
|
12
|
+
private
|
|
13
|
+
|
|
14
|
+
# Yields the Word document at @filename one line at a time, splitting
# the extracted contents on "\n". Returns an enumerator when no block
# is given. Any StandardError is re-raised as a RuntimeError prefixed
# with the file's basename.
def rows(&block)
  return enum_for(:rows) unless block

  word_path = SafeFile.safepath_to_string(@filename)
  contents = MSWordDoc::Extractor.load(word_path).whole_contents

  contents.split("\n").each { |line| block.call(line) }
rescue => e
  raise("#{SafeFile.basename(@filename)} [#{e.class}: #{e.message}]")
end
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
Registry.register(Word, 'doc') # TODO: Add 'word'?
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
require 'zip'
|
|
2
|
+
require 'ndr_support/safe_file'
|
|
3
|
+
require_relative 'registry'
|
|
4
|
+
|
|
5
|
+
module NdrImport
|
|
6
|
+
# This is one of a collection of file handlers that deal with individual formats of data.
|
|
7
|
+
# They can be instantiated directly or via the factory method Registry.tables
|
|
8
|
+
module File
|
|
9
|
+
# This class is a zip file handler that returns tables from the extracted files.
|
|
10
|
+
class Zip < Base
|
|
11
|
+
# Builds a zip handler. On top of whatever the superclass stores, two
# string-keyed options are read:
# * 'unzip_path' - where entries are extracted to; checked immediately
#   by validate_unzip_path_is_safe!
# * 'pattern'    - matched against each entry's basename; defaults to
#   an empty regexp, which matches every name
def initialize(filename, format, options = {})
  super

  @unzip_path = options['unzip_path']
  @pattern = options['pattern'] || //

  validate_unzip_path_is_safe!
end
|
|
18
|
+
|
|
19
|
+
# Extracts the matching entries of the zip file and yields whatever
# Registry.files produces for each extracted file. Returns an
# enumerator when called without a block.
#
# Raises when Rails is loaded and Rails.env.external? is true; that
# check runs before the enumerator is built.
def files(&block)
  if defined?(::Rails) && ::Rails.env.external?
    fail 'Not allowed in external environment'
  end

  return enum_for(:files) unless block

  # Extract into a time-stamped (HHMMSS + milliseconds) subdirectory of
  # the unzip path.
  timestamp = Time.current.strftime('%H%M%S%L')
  destination = @unzip_path.join(timestamp)
  FileUtils.mkdir_p(SafeFile.safepath_to_string(destination))

  archive_path = SafeFile.safepath_to_string(@filename)
  ::Zip::File.open(archive_path) { |zipfile| unzip_entries(zipfile, destination, &block) }
end
|
|
31
|
+
|
|
32
|
+
# Zip files produce files, never tables.
|
|
33
|
+
# Always raises: a zip handler only ever produces files, so asking it
# for tables is a programming error.
def tables
  raise 'Zip#tables should never be called'
end
|
|
36
|
+
|
|
37
|
+
private
|
|
38
|
+
|
|
39
|
+
# Unzip the zip file entry and enumerate over it
|
|
40
|
+
# Extracts every file entry whose basename matches @pattern into
# `destination`, then enumerates each extracted file via
# unzipped_files, forwarding `block`.
def unzip_entries(zipfile, destination, &block)
  zipfile.entries.each do |entry|
    # SECURE: TPG 2010-11-1: The path is stripped from the zipfile entry when extracted
    entry_name = ::File.basename(entry.name)
    next if !entry.file? || !entry_name.match(@pattern)

    extracted_path = destination.join(entry_name)
    zipfile.extract(entry, extracted_path)

    unzipped_files(extracted_path, &block)
  end
end
|
|
52
|
+
|
|
53
|
+
# Enumerate over an unzipped file like any other
|
|
54
|
+
# Hands the extracted file to Registry.files (with this handler's
# @options) and passes each result on to `block`.
def unzipped_files(unzipped_filename, &block)
  extracted = Registry.files(unzipped_filename, @options)
  extracted.each { |filename| block.call(filename) }
end
|
|
59
|
+
|
|
60
|
+
# Passes @unzip_path through SafeFile.safepath_to_string purely for its
# validation; the result is not used by the caller in this file.
# NOTE(review): presumably raises for anything that is not a SafePath -
# confirm against ndr_support.
def validate_unzip_path_is_safe!
  candidate = @unzip_path
  SafeFile.safepath_to_string(candidate)
end
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
Registry.register(Zip, 'zip')
|
|
66
|
+
end
|
|
67
|
+
end
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
require 'ndr_support/safe_file'
|
|
2
|
+
require 'ndr_import/csv_library'
|
|
3
|
+
|
|
4
|
+
module NdrImport
|
|
5
|
+
module Helpers
|
|
6
|
+
module File
|
|
7
|
+
# This mixin adds delimited file functionality to unified importers.
|
|
8
|
+
module Delimited
|
|
9
|
+
# Read a plain text CSV file, return an array of the content
|
|
10
|
+
# Reads a comma-separated file into an array of rows by delegating to
# read_delimited_file with the default separator.
def read_csv_file(path)
  # If a CSV file generated on Mac OS triggers "CSV::IllegalFormatError",
  # see:
  # http://stackoverflow.com/questions/1549139/ruby-cannot-parse-excel-file-exported-as-csv-in-os-x
  rows = read_delimited_file(path)
  rows
end
|
|
17
|
+
|
|
18
|
+
# Slurp the entire file into an array of lines.
|
|
19
|
+
# Loads the whole delimited file into memory and returns it as an array
# of rows (see delimited_rows for what counts as a row).
def read_delimited_file(path, col_sep = nil)
  row_enumerator = delimited_rows(path, col_sep)
  row_enumerator.to_a
end
|
|
22
|
+
|
|
23
|
+
# Iterate through the file table by table, yielding each one in turn.
|
|
24
|
+
# A delimited file holds exactly one, unnamed table: yields nil (the
# table name) together with the enumerator from delimited_rows.
# Returns an enumerator when called without a block.
def delimited_tables(path, col_sep = nil)
  if block_given?
    yield nil, delimited_rows(path, col_sep)
  else
    enum_for(:delimited_tables, path, col_sep)
  end
end
|
|
29
|
+
|
|
30
|
+
# Deprecated method
|
|
31
|
+
# Deprecated alias of delimited_tables; prints a warning, then
# delegates with the same arguments and block.
def each_delimited_table(path, col_sep = nil, &block)
  notice = '[warning] each_delimited_table will be deprecated, please use delimited_tables instead.'
  Kernel.warn notice

  delimited_tables(path, col_sep, &block)
end
|
|
36
|
+
|
|
37
|
+
# Iterate through the file line by line, yielding each one in turn.
|
|
38
|
+
# Streams the delimited file row by row, yielding each row as an array
# of strings. Returns an enumerator when called without a block.
def delimited_rows(path, col_sep = nil)
  return enum_for(:delimited_rows, path, col_sep) unless block_given?

  safe_path = SafeFile.safepath_to_string(path)
  csv_options = determine_encodings!(safe_path, col_sep)

  # By now, we know `csv_options` should let us read the whole
  # file successfully; if there are problems, we should crash.
  CSVLibrary.foreach(safe_path, csv_options) do |fields|
    # NOTE(review): rows with five or fewer fields are silently dropped;
    # the reason is not visible here - confirm this is intentional.
    next if fields.length <= 5

    yield fields.map(&:to_s)
  end
end
|
|
50
|
+
|
|
51
|
+
# Deprecated method
|
|
52
|
+
# Deprecated alias of delimited_rows; prints a warning, then delegates
# with the same arguments and block.
def each_delimited_row(path, col_sep = nil, &block)
  notice = '[warning] each_delimited_row will be deprecated, please use delimited_rows instead.'
  Kernel.warn notice

  delimited_rows(path, col_sep, &block)
end
|
|
57
|
+
|
|
58
|
+
private
|
|
59
|
+
|
|
60
|
+
# Derive the source encoding by trying all supported encodings.
|
|
61
|
+
# Returns first set of working options, or raises if none could be found.
|
|
62
|
+
# Derive the source encoding by trying all supported encodings.
#
# For each candidate the whole file is read once with CSVLibrary.foreach.
# Returns the first options hash (:col_sep and :encoding) that reads the
# file to the end.
#
# Raises:
# * CSVLibrary::MalformedCSVError - re-raised with the row number and
#   file basename when the file is not well-formed
# * ArgumentError - re-raised unchanged unless it reports an invalid
#   byte sequence (which just means "try the next encoding")
# * RuntimeError - when none of the encodings worked
#
# `col_sep` is never modified: the separator is copied before being
# re-tagged with each candidate encoding.
def determine_encodings!(safe_path, col_sep = nil)
  # delimiter encoding => # FasterCSV encoding string
  supported_encodings = {
    'UTF-8'        => 'bom|utf-8',
    'Windows-1252' => 'windows-1252:utf-8'
  }

  successful_options = nil
  supported_encodings.each do |delimiter_encoding, csv_encoding|
    begin
      options = {
        # String#force_encoding changes its receiver in place, so work on
        # a copy: this keeps the caller's string untouched and also
        # accepts frozen separators.
        :col_sep  => (col_sep || ',').dup.force_encoding(delimiter_encoding),
        :encoding => csv_encoding
      }

      row_num = 0
      # Iterate through the file; if we reach the end, this encoding worked:
      CSVLibrary.foreach(safe_path, options) { |_line| row_num += 1 }
    rescue ArgumentError => e
      next if e.message =~ /invalid byte sequence/ # This encoding didn't work
      raise(e)
    rescue CSVLibrary::MalformedCSVError => e
      description = (col_sep ? col_sep.inspect + ' delimited' : 'CSV')

      raise(CSVLibrary::MalformedCSVError, "Invalid #{description} format " \
        "on row #{row_num + 1} of #{::File.basename(safe_path)}. Original: #{e.message}")
    end

    # We got this far => encoding choice worked:
    successful_options = options
    break
  end

  # We tried them all, and none worked:
  unless successful_options
    fail "None of the encodings #{supported_encodings.values.inspect} were successful!"
  end

  successful_options
end
|
|
102
|
+
end
|
|
103
|
+
end
|
|
104
|
+
end
|
|
105
|
+
end
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
require 'ndr_support/safe_file'
|
|
2
|
+
|
|
3
|
+
module NdrImport
|
|
4
|
+
module Helpers
|
|
5
|
+
module File
|
|
6
|
+
# This mixin adds excel spreadsheet functionality to unified importers.
|
|
7
|
+
# It provides a file reader method and methods to cast raw values
|
|
8
|
+
# appropriately. These methods can be overridden or aliased as required.
|
|
9
|
+
#
|
|
10
|
+
module Excel
|
|
11
|
+
require 'roo'
|
|
12
|
+
require 'roo-xls'
|
|
13
|
+
require 'ole/storage'
|
|
14
|
+
|
|
15
|
+
protected
|
|
16
|
+
|
|
17
|
+
# Converts a raw spreadsheet cell value into the form the importer
# works with:
# * nil                    -> nil
# * Date / DateTime / Time -> result of cast_excel_datetime_as_date
# * Float, whole number    -> integer string, e.g. 3.0 => "3"
# * Float, otherwise       -> Float#to_s, e.g. 2.5 => "2.5"; NaN and
#                             +/-Infinity become "NaN" / "Infinity" /
#                             "-Infinity" instead of raising
# * anything else          -> to_s with surrounding whitespace stripped
def cast_excel_value(raw_value)
  return raw_value if raw_value.nil?

  case raw_value
  when Date, DateTime, Time
    cast_excel_datetime_as_date(raw_value)
  when Float
    # Float#to_i raises FloatDomainError for NaN and infinities, so only
    # attempt the whole-number check on finite values.
    if raw_value.finite? && raw_value == raw_value.to_i
      raw_value.to_i.to_s
    else
      raw_value.to_s
    end
  else
    raw_value.to_s.strip
  end
end
|
|
33
|
+
|
|
34
|
+
# Formats a date/time cell by calling to_s(:db) on it.
# NOTE(review): to_s with a format argument is not core Ruby; this
# presumably relies on ActiveSupport's date extensions - confirm.
def cast_excel_datetime_as_date(raw_value)
  format = :db
  raw_value.to_s(format)
end
|
|
37
|
+
|
|
38
|
+
# Iterate through the file table by table, yielding each one in turn.
|
|
39
|
+
# Walks the workbook sheet by sheet, yielding each sheet's name along
# with an enumerator over its rows. Returns an enumerator when called
# without a block.
def excel_tables(path)
  if block_given?
    workbook = load_workbook(path)
    workbook.each_with_pagename do |sheet_name, sheet|
      yield sheet_name, excel_rows(workbook, sheet)
    end
  else
    enum_for(:excel_tables, path)
  end
end
|
|
47
|
+
|
|
48
|
+
# Deprecated method
|
|
49
|
+
# Deprecated alias of excel_tables; prints a warning, then delegates
# with the same argument and block.
def each_excel_table(path, &block)
  notice = '[warning] each_excel_table will be deprecated, please use excel_tables instead.'
  Kernel.warn notice

  excel_tables(path, &block)
end
|
|
54
|
+
|
|
55
|
+
private
|
|
56
|
+
|
|
57
|
+
# Reads one worksheet of the workbook at `path` into a nested array of
# cast cell values.
#
# `selected_sheet` names the worksheet to read; when it is nil, or the
# workbook has no sheet of that name, the first worksheet is used.
def read_excel_file(path, selected_sheet = nil)
  # SECURE: TVB Mon Aug 13 15:30:32 BST 2012 SafeFile.safepath_to_string makes sure that
  # the path is SafePath.
  workbook = load_workbook(path)

  sheet_available = !selected_sheet.nil? && workbook.sheets.include?(selected_sheet.to_s)
  workbook.default_sheet = sheet_available ? selected_sheet.to_s : workbook.sheets.first

  # The workbook itself stands in for the sheet: cells are read from
  # whichever worksheet was just made the default.
  excel_rows(workbook, workbook).to_a
end
|
|
75
|
+
|
|
76
|
+
# Iterate through the sheet line by line, yielding each one in turn.
|
|
77
|
+
# Yields the rows of `sheet` one at a time, or returns an enumerator
# when no block is given. The workbook's type is inspected, but both
# formats are currently read the same way, via xls_rows.
def excel_rows(workbook, sheet, &block)
  return enum_for(:excel_rows, workbook, sheet) unless block

  case workbook
  when Roo::Excelx
    # FIXME: xlsx_rows(sheet, &block) should produce the same output as xls_rows
    xls_rows(sheet, &block)
  else
    xls_rows(sheet, &block)
  end
end
|
|
87
|
+
|
|
88
|
+
# Deprecated method
|
|
89
|
+
# Deprecated alias of excel_rows; prints a warning, then delegates with
# the same arguments and block.
def each_excel_row(workbook, sheet, &block)
  notice = '[warning] each_excel_row will be deprecated, please use excel_rows instead.'
  Kernel.warn notice

  excel_rows(workbook, sheet, &block)
end
|
|
94
|
+
|
|
95
|
+
# Iterate through an xls sheet line by line, yielding each one in turn.
|
|
96
|
+
# Yields every row between the sheet's first and last row as an array
# of cast cell values, covering the sheet's first to last column.
# Returns an enumerator when called without a block.
def xls_rows(sheet)
  return enum_for(:xls_rows, sheet) unless block_given?

  sheet.first_row.upto(sheet.last_row) do |row_index|
    columns = sheet.first_column.upto(sheet.last_column)
    yield columns.map { |col_index| cast_excel_value(sheet.cell(row_index, col_index)) }
  end
end
|
|
107
|
+
|
|
108
|
+
# Deprecated method
|
|
109
|
+
# Deprecated alias of xls_rows; prints a warning, then delegates with
# the same argument and block.
def each_xls_row(sheet, &block)
  notice = '[warning] each_xls_row will be deprecated, please use xls_rows instead.'
  Kernel.warn notice

  xls_rows(sheet, &block)
end
|
|
114
|
+
|
|
115
|
+
# Iterate through an xlsx sheet line by line, yielding each one in turn.
|
|
116
|
+
# This method uses streaming https://github.com/roo-rb/roo#excel-xlsx-and-xlsm-support
|
|
117
|
+
# Yields each row of an xlsx sheet as an array of cast cell values,
# using the sheet's each_row_streaming with :pad_cells enabled
# (see https://github.com/roo-rb/roo#excel-xlsx-and-xlsm-support).
# Returns an enumerator when called without a block.
def xlsx_rows(sheet)
  return enum_for(:xlsx_rows, sheet) unless block_given?

  sheet.each_row_streaming(:pad_cells => true) do |cells|
    values = cells.map { |cell| cast_excel_value(cell.value) }
    yield values
  end
end
|
|
124
|
+
|
|
125
|
+
# Deprecated method
|
|
126
|
+
# Deprecated alias of xlsx_rows; prints a warning, then delegates with
# the same argument and block.
def each_xlsx_row(sheet, &block)
  notice = '[warning] each_xlsx_row will be deprecated, please use xlsx_rows instead.'
  Kernel.warn notice

  xlsx_rows(sheet, &block)
end
|
|
131
|
+
|
|
132
|
+
# Returns the `sheets` of the workbook loaded from `path`.
def get_excel_sheets_name(path)
  load_workbook(path).sheets
end
|
|
136
|
+
|
|
137
|
+
# Opens the spreadsheet at `path` with the Roo reader that matches its
# extension (compared case-insensitively): Roo::Excel for .xls,
# Roo::Excelx for .xlsx.
#
# If an .xls file turns out not to be in the old binary format
# (Ole::Storage::FormatError), it is copied alongside itself as
# "<name>_amend.xlsx" and loaded again as xlsx.
#
# Raises a RuntimeError for any other extension, or when the file cannot
# be read; in the latter case the message includes the path and the
# underlying error message.
def load_workbook(path)
  case SafeFile.extname(path).downcase
  when '.xls'
    Roo::Excel.new(SafeFile.safepath_to_string(path))
  when '.xlsx'
    Roo::Excelx.new(SafeFile.safepath_to_string(path))
  else
    fail "Received file path with unexpected extension #{SafeFile.extname(path)}"
  end
rescue Ole::Storage::FormatError => e
  # TODO: Do we need to remove the new_file after using it?

  # try to load the .xls file as an .xlsx file, useful for sources like USOM
  # roo check file extensions in file_type_check (GenericSpreadsheet),
  # so we create a duplicate file in xlsx extension
  #
  # The extension test is case-insensitive, to agree with the dispatch
  # above (so "REPORT.XLS" is retried too), and it is applied to the
  # String basename rather than to the path object itself.
  basename = SafeFile.basename(path)
  raise e.message unless basename =~ /\.xls\z/i

  new_file_name = basename.sub(/\.xls\z/i, '_amend.xlsx')
  new_file_path = SafeFile.dirname(path).join(new_file_name)
  copy_file(path, new_file_path)

  load_workbook(new_file_path)
rescue => e
  raise ["Unable to read the file '#{path}'", e.message].join('; ')
end
|
|
164
|
+
|
|
165
|
+
# Note that this method can produce insecure calls. All callers must protect
|
|
166
|
+
# their arguments.
|
|
167
|
+
# Arguments:
|
|
168
|
+
# * source - SafeFile
|
|
169
|
+
# * dest - SafeFile
|
|
170
|
+
#
|
|
171
|
+
# Copies `source` to `dest`, creating dest's parent directory first.
# Both arguments go through SafeFile.safepath_to_string before any
# filesystem call is made with them.
def copy_file(source, dest)
  # SECURE: TVB Mon Aug 13 13:53:02 BST 2012 : Secure SafePath will do the security checks
  # before it is converted to string.
  # SafeFile will make sure that the arguments are actually SafePath
  target_dir = SafeFile.safepath_to_string(SafeFile.dirname(dest))
  FileUtils.mkdir_p(target_dir)

  from = SafeFile.safepath_to_string(source)
  to = SafeFile.safepath_to_string(dest)
  FileUtils.cp(from, to)
end
|
|
178
|
+
end
|
|
179
|
+
end
|
|
180
|
+
end
|
|
181
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
require 'ndr_support/safe_file'
|
|
2
|
+
|
|
3
|
+
module NdrImport
|
|
4
|
+
module Helpers
|
|
5
|
+
module File
|
|
6
|
+
# This mixin adds PDF functionality to unified importers. It provides a file reader method.
|
|
7
|
+
module Pdf
|
|
8
|
+
private
|
|
9
|
+
|
|
10
|
+
# Returns the text of the PDF at `path` as a flat array of lines, page
# after page (each page's text split on "\n").
#
# Any StandardError raised while opening or reading the document is
# re-raised as a RuntimeError naming the page being read and the file.
def read_pdf_file(path)
  require 'pdf-reader' # loaded on first use

  lines = []
  pages_read = 0

  document = PDF::Reader.new(SafeFile.safepath_to_string(path))
  document.pages.each do |page|
    lines.concat(page.text.split("\n"))
    pages_read += 1
  end

  lines
rescue => e
  raise("Invalid format on page #{pages_read + 1} of #{SafeFile.basename(path)} [#{e.class}: #{e.message}]")
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|