tabular-reader 0.1.2__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tabular-reader
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: Read XLSX, XLS and CSV files with a uniform interface Read XLSX, XLS and CSV files with a uniform interface
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "tabular-reader"
3
- version = "0.1.2"
3
+ version = "0.1.4"
4
4
  description = "Read XLSX, XLS and CSV files with a uniform interface Read XLSX, XLS and CSV files with a uniform interface"
5
5
  authors = ["arkhan <arkhan@riseup.net>"]
6
6
  license = "MIT"
@@ -2,5 +2,5 @@
2
2
 
3
3
  from .reader import TabularReader
4
4
 
5
- __version__ = "0.1.2"
5
+ __version__ = "0.1.4"
6
6
  __all__ = ["TabularReader"]
@@ -17,6 +17,8 @@ def read_csv(source, **kwargs):
17
17
  k: v for k, v in kwargs.items() if k in ["delimiter", "quotechar", "encoding"]
18
18
  }
19
19
 
20
+ if isinstance(source, bytes):
21
+ source = io.BytesIO(source)
20
22
  if isinstance(source, io.BytesIO):
21
23
  source.seek(0)
22
24
  f = io.TextIOWrapper(source, encoding=csv_kwargs.pop("encoding", "utf-8-sig"))
@@ -39,9 +41,11 @@ def read_csv(source, **kwargs):
39
41
  ]
40
42
 
41
43
 
42
- def read_xls(source, worksheet="", **kwargs):
44
+ def read_xls(source, worksheet="", skip_rows=0, **kwargs):
43
45
  import xlrd
44
46
 
47
+ if isinstance(source, bytes):
48
+ source = io.BytesIO(source)
45
49
  if isinstance(source, io.BytesIO):
46
50
  source.seek(0)
47
51
  wb = xlrd.open_workbook(file_contents=source.read())
@@ -51,7 +55,8 @@ def read_xls(source, worksheet="", **kwargs):
51
55
  ws = wb.sheet_by_name(worksheet) if worksheet else wb.sheet_by_index(0)
52
56
 
53
57
  total = [[cell.value for cell in row] for row in ws.get_rows()]
54
- header = total[0] if total else []
58
+ header_idx = skip_rows if skip_rows < len(total) else 0
59
+ header = total[header_idx] if total else []
55
60
  filtered_indices = [
56
61
  i for i, val in enumerate(header) if val is not None and str(val).strip() != ""
57
62
  ]
@@ -60,13 +65,14 @@ def read_xls(source, worksheet="", **kwargs):
60
65
  ]
61
66
 
62
67
 
63
- def read_xlsx(source, worksheet="", **kwargs):
68
+ def read_xlsx(source, worksheet="", skip_rows=0, **kwargs):
64
69
  from openpyxl import load_workbook
65
70
 
66
- excel_kwargs = {
67
- k: v for k, v in kwargs.items() if k not in ["delimiter", "encoding"]
68
- }
71
+ _OPENPYXL_KWARGS = {"read_only", "keep_vba", "data_only", "keep_links", "rich_text"}
72
+ excel_kwargs = {k: v for k, v in kwargs.items() if k in _OPENPYXL_KWARGS}
69
73
 
74
+ if isinstance(source, bytes):
75
+ source = io.BytesIO(source)
70
76
  if isinstance(source, io.BytesIO):
71
77
  source.seek(0)
72
78
 
@@ -74,7 +80,8 @@ def read_xlsx(source, worksheet="", **kwargs):
74
80
  ws = worksheet and wb[worksheet] or wb.active
75
81
 
76
82
  total = [[col.value for col in row] for row in ws]
77
- header = total[0] if total else []
83
+ header_idx = skip_rows if skip_rows < len(total) else 0
84
+ header = total[header_idx] if total else []
78
85
  filtered_indices = [
79
86
  i for i, val in enumerate(header) if val is not None and str(val).strip() != ""
80
87
  ]
@@ -92,22 +99,29 @@ class TabularReader:
92
99
  restval=None,
93
100
  restkey=None,
94
101
  skip_blank_lines=False,
102
+ skip_rows=0,
103
+ skip_row=None,
95
104
  *args,
96
105
  **kwargs,
97
106
  ):
107
+ if skip_row is not None and not skip_rows:
108
+ skip_rows = skip_row
98
109
  file_format = get_file_format(source)
99
110
 
100
111
  if file_format is None:
101
- filtered_data = self._detect_and_read_bytes(source, worksheet, **kwargs)
112
+ filtered_data = self._detect_and_read_bytes(source, worksheet, skip_rows=skip_rows, **kwargs)
102
113
  elif file_format == "csv":
103
114
  filtered_data = read_csv(source, **kwargs)
104
115
  elif file_format == "xlsx":
105
- filtered_data = read_xlsx(source, worksheet, **kwargs)
116
+ filtered_data = read_xlsx(source, worksheet, skip_rows=skip_rows, **kwargs)
106
117
  elif file_format == "xls":
107
- filtered_data = read_xls(source, worksheet, **kwargs)
118
+ filtered_data = read_xls(source, worksheet, skip_rows=skip_rows, **kwargs)
108
119
  else:
109
120
  raise ValueError(f"Unsupported format: {file_format}")
110
121
 
122
+ if skip_rows:
123
+ filtered_data = filtered_data[skip_rows:]
124
+
111
125
  self.reader = iter(filtered_data)
112
126
  self._fieldnames = fieldnames
113
127
  self.restkey = restkey
@@ -115,8 +129,10 @@ class TabularReader:
115
129
  self.skip_blank_lines = skip_blank_lines
116
130
  self.line_num = 0
117
131
 
118
- def _detect_and_read_bytes(self, source, worksheet, **kwargs):
132
+ def _detect_and_read_bytes(self, source, worksheet, skip_rows=0, **kwargs):
119
133
  errors = []
134
+ if isinstance(source, bytes):
135
+ source = io.BytesIO(source)
120
136
 
121
137
  for fmt, reader_func in [
122
138
  ("xlsx", read_xlsx),
@@ -126,15 +142,14 @@ class TabularReader:
126
142
  try:
127
143
  if isinstance(source, io.BytesIO):
128
144
  source.seek(0)
129
- return reader_func(source, worksheet, **kwargs)
145
+ return reader_func(source, worksheet, skip_rows=skip_rows, **kwargs)
130
146
  except Exception as e:
131
147
  errors.append((fmt, str(e)))
132
148
  if isinstance(source, io.BytesIO):
133
149
  source.seek(0)
134
150
 
135
- raise ValueError(
136
- f"Could not detect file format. Tried: {', '.join(f[0] for f in errors)}"
137
- )
151
+ details = "; ".join(f"{fmt}: {err}" for fmt, err in errors)
152
+ raise ValueError(f"Could not detect file format. Tried: {details}")
138
153
 
139
154
  @property
140
155
  def fieldnames(self):
File without changes