tabular-reader 0.1.2__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tabular_reader-0.1.2 → tabular_reader-0.1.4}/PKG-INFO +1 -1
- {tabular_reader-0.1.2 → tabular_reader-0.1.4}/pyproject.toml +1 -1
- {tabular_reader-0.1.2 → tabular_reader-0.1.4}/tabular_reader/__init__.py +1 -1
- {tabular_reader-0.1.2 → tabular_reader-0.1.4}/tabular_reader/reader.py +30 -15
- {tabular_reader-0.1.2 → tabular_reader-0.1.4}/LICENSE +0 -0
- {tabular_reader-0.1.2 → tabular_reader-0.1.4}/README.org +0 -0
|
@@ -17,6 +17,8 @@ def read_csv(source, **kwargs):
|
|
|
17
17
|
k: v for k, v in kwargs.items() if k in ["delimiter", "quotechar", "encoding"]
|
|
18
18
|
}
|
|
19
19
|
|
|
20
|
+
if isinstance(source, bytes):
|
|
21
|
+
source = io.BytesIO(source)
|
|
20
22
|
if isinstance(source, io.BytesIO):
|
|
21
23
|
source.seek(0)
|
|
22
24
|
f = io.TextIOWrapper(source, encoding=csv_kwargs.pop("encoding", "utf-8-sig"))
|
|
@@ -39,9 +41,11 @@ def read_csv(source, **kwargs):
|
|
|
39
41
|
]
|
|
40
42
|
|
|
41
43
|
|
|
42
|
-
def read_xls(source, worksheet="", **kwargs):
|
|
44
|
+
def read_xls(source, worksheet="", skip_rows=0, **kwargs):
|
|
43
45
|
import xlrd
|
|
44
46
|
|
|
47
|
+
if isinstance(source, bytes):
|
|
48
|
+
source = io.BytesIO(source)
|
|
45
49
|
if isinstance(source, io.BytesIO):
|
|
46
50
|
source.seek(0)
|
|
47
51
|
wb = xlrd.open_workbook(file_contents=source.read())
|
|
@@ -51,7 +55,8 @@ def read_xls(source, worksheet="", **kwargs):
|
|
|
51
55
|
ws = wb.sheet_by_name(worksheet) if worksheet else wb.sheet_by_index(0)
|
|
52
56
|
|
|
53
57
|
total = [[cell.value for cell in row] for row in ws.get_rows()]
|
|
54
|
-
|
|
58
|
+
header_idx = skip_rows if skip_rows < len(total) else 0
|
|
59
|
+
header = total[header_idx] if total else []
|
|
55
60
|
filtered_indices = [
|
|
56
61
|
i for i, val in enumerate(header) if val is not None and str(val).strip() != ""
|
|
57
62
|
]
|
|
@@ -60,13 +65,14 @@ def read_xls(source, worksheet="", **kwargs):
|
|
|
60
65
|
]
|
|
61
66
|
|
|
62
67
|
|
|
63
|
-
def read_xlsx(source, worksheet="", **kwargs):
|
|
68
|
+
def read_xlsx(source, worksheet="", skip_rows=0, **kwargs):
|
|
64
69
|
from openpyxl import load_workbook
|
|
65
70
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
}
|
|
71
|
+
_OPENPYXL_KWARGS = {"read_only", "keep_vba", "data_only", "keep_links", "rich_text"}
|
|
72
|
+
excel_kwargs = {k: v for k, v in kwargs.items() if k in _OPENPYXL_KWARGS}
|
|
69
73
|
|
|
74
|
+
if isinstance(source, bytes):
|
|
75
|
+
source = io.BytesIO(source)
|
|
70
76
|
if isinstance(source, io.BytesIO):
|
|
71
77
|
source.seek(0)
|
|
72
78
|
|
|
@@ -74,7 +80,8 @@ def read_xlsx(source, worksheet="", **kwargs):
|
|
|
74
80
|
ws = worksheet and wb[worksheet] or wb.active
|
|
75
81
|
|
|
76
82
|
total = [[col.value for col in row] for row in ws]
|
|
77
|
-
|
|
83
|
+
header_idx = skip_rows if skip_rows < len(total) else 0
|
|
84
|
+
header = total[header_idx] if total else []
|
|
78
85
|
filtered_indices = [
|
|
79
86
|
i for i, val in enumerate(header) if val is not None and str(val).strip() != ""
|
|
80
87
|
]
|
|
@@ -92,22 +99,29 @@ class TabularReader:
|
|
|
92
99
|
restval=None,
|
|
93
100
|
restkey=None,
|
|
94
101
|
skip_blank_lines=False,
|
|
102
|
+
skip_rows=0,
|
|
103
|
+
skip_row=None,
|
|
95
104
|
*args,
|
|
96
105
|
**kwargs,
|
|
97
106
|
):
|
|
107
|
+
if skip_row is not None and not skip_rows:
|
|
108
|
+
skip_rows = skip_row
|
|
98
109
|
file_format = get_file_format(source)
|
|
99
110
|
|
|
100
111
|
if file_format is None:
|
|
101
|
-
filtered_data = self._detect_and_read_bytes(source, worksheet, **kwargs)
|
|
112
|
+
filtered_data = self._detect_and_read_bytes(source, worksheet, skip_rows=skip_rows, **kwargs)
|
|
102
113
|
elif file_format == "csv":
|
|
103
114
|
filtered_data = read_csv(source, **kwargs)
|
|
104
115
|
elif file_format == "xlsx":
|
|
105
|
-
filtered_data = read_xlsx(source, worksheet, **kwargs)
|
|
116
|
+
filtered_data = read_xlsx(source, worksheet, skip_rows=skip_rows, **kwargs)
|
|
106
117
|
elif file_format == "xls":
|
|
107
|
-
filtered_data = read_xls(source, worksheet, **kwargs)
|
|
118
|
+
filtered_data = read_xls(source, worksheet, skip_rows=skip_rows, **kwargs)
|
|
108
119
|
else:
|
|
109
120
|
raise ValueError(f"Unsupported format: {file_format}")
|
|
110
121
|
|
|
122
|
+
if skip_rows:
|
|
123
|
+
filtered_data = filtered_data[skip_rows:]
|
|
124
|
+
|
|
111
125
|
self.reader = iter(filtered_data)
|
|
112
126
|
self._fieldnames = fieldnames
|
|
113
127
|
self.restkey = restkey
|
|
@@ -115,8 +129,10 @@ class TabularReader:
|
|
|
115
129
|
self.skip_blank_lines = skip_blank_lines
|
|
116
130
|
self.line_num = 0
|
|
117
131
|
|
|
118
|
-
def _detect_and_read_bytes(self, source, worksheet, **kwargs):
|
|
132
|
+
def _detect_and_read_bytes(self, source, worksheet, skip_rows=0, **kwargs):
|
|
119
133
|
errors = []
|
|
134
|
+
if isinstance(source, bytes):
|
|
135
|
+
source = io.BytesIO(source)
|
|
120
136
|
|
|
121
137
|
for fmt, reader_func in [
|
|
122
138
|
("xlsx", read_xlsx),
|
|
@@ -126,15 +142,14 @@ class TabularReader:
|
|
|
126
142
|
try:
|
|
127
143
|
if isinstance(source, io.BytesIO):
|
|
128
144
|
source.seek(0)
|
|
129
|
-
return reader_func(source, worksheet, **kwargs)
|
|
145
|
+
return reader_func(source, worksheet, skip_rows=skip_rows, **kwargs)
|
|
130
146
|
except Exception as e:
|
|
131
147
|
errors.append((fmt, str(e)))
|
|
132
148
|
if isinstance(source, io.BytesIO):
|
|
133
149
|
source.seek(0)
|
|
134
150
|
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
)
|
|
151
|
+
details = "; ".join(f"{fmt}: {err}" for fmt, err in errors)
|
|
152
|
+
raise ValueError(f"Could not detect file format. Tried: {details}")
|
|
138
153
|
|
|
139
154
|
@property
|
|
140
155
|
def fieldnames(self):
|
|
File without changes
|
|
File without changes
|