acryl-datahub 1.2.0.6__py3-none-any.whl → 1.2.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic; consult the registry's advisory page for more details.
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/METADATA +2629 -2543
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/RECORD +83 -75
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/graphql/operation.py +1 -1
- datahub/ingestion/autogenerated/capability_summary.json +46 -6
- datahub/ingestion/autogenerated/lineage.json +3 -2
- datahub/ingestion/run/pipeline.py +1 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +97 -5
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/common/subtypes.py +3 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +19 -8
- datahub/ingestion/source/dbt/dbt_common.py +74 -0
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_source.py +4 -0
- datahub/ingestion/source/dynamodb/dynamodb.py +10 -7
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/fivetran/fivetran_query.py +8 -1
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +33 -0
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +5 -0
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
- datahub/ingestion/source/redshift/config.py +9 -6
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/redshift.py +19 -106
- datahub/ingestion/source/s3/source.py +65 -59
- datahub/ingestion/source/snowflake/constants.py +2 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
- datahub/ingestion/source/snowflake/snowflake_connection.py +16 -5
- datahub/ingestion/source/snowflake/snowflake_query.py +27 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +179 -7
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -7
- datahub/ingestion/source/snowflake/snowflake_summary.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_utils.py +18 -5
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -1
- datahub/ingestion/source/sql/hive_metastore.py +1 -0
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +62 -3
- datahub/ingestion/source/sql_queries.py +24 -2
- datahub/ingestion/source/state/checkpoint.py +3 -28
- datahub/ingestion/source/unity/config.py +74 -9
- datahub/ingestion/source/unity/proxy.py +167 -5
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +24 -0
- datahub/ingestion/source/unity/report.py +5 -0
- datahub/ingestion/source/unity/source.py +111 -1
- datahub/ingestion/source/usage/usage_common.py +1 -0
- datahub/metadata/_internal_schema_classes.py +573 -517
- datahub/metadata/_urns/urn_defs.py +1748 -1748
- datahub/metadata/schema.avsc +18564 -18484
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -0
- datahub/metadata/schemas/InstitutionalMemory.avsc +9 -0
- datahub/metadata/schemas/LogicalParent.avsc +104 -100
- datahub/metadata/schemas/MetadataChangeEvent.avsc +81 -45
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +3 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/chart.py +36 -22
- datahub/sdk/dashboard.py +38 -62
- datahub/sdk/lineage_client.py +6 -26
- datahub/sdk/main_client.py +7 -3
- datahub/sdk/search_filters.py +16 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/dataset.py +2 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +3 -0
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/upgrade/upgrade.py +14 -2
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,539 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
3
|
+
from collections import Counter
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from io import BytesIO
|
|
6
|
+
from typing import Any, Dict, Iterator, List, Optional, Union
|
|
7
|
+
|
|
8
|
+
import openpyxl
|
|
9
|
+
import pandas as pd
|
|
10
|
+
from openpyxl.workbook import Workbook
|
|
11
|
+
|
|
12
|
+
from datahub.ingestion.source.excel.report import ExcelSourceReport
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class ExcelTable:
    """A single table extracted from one Excel worksheet.

    Produced by ``ExcelFile.get_table``; bundles the parsed data together
    with positional information and the metadata harvested from the
    workbook properties and from the rows above/below the table.
    """

    # Parsed table data, one column per detected header cell.
    df: pd.DataFrame
    # Index of the first data row (get_table passes header_row_idx + 1).
    header_row: int
    # Index of the first footer row; equals the sheet row count when no footer was found.
    footer_row: int
    # Number of data rows in df (df.shape[0]).
    row_count: int
    # Number of columns in df (df.shape[1]).
    column_count: int
    # Workbook properties merged with key/value pairs found above and below the table.
    metadata: Dict[str, Any]
    # Worksheet title, stripped of surrounding whitespace.
    sheet_name: str
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class ExcelFile:
    """Wrapper around an openpyxl workbook loaded from an in-memory buffer.

    Detects, per worksheet, a tabular region via scoring heuristics
    (header row detection, footer detection) and exposes each region as an
    :class:`ExcelTable` with a pandas DataFrame plus merged metadata.
    Errors and dropped sheets are recorded on the supplied
    :class:`ExcelSourceReport`.
    """

    # Underlying workbook; only set after a successful load_workbook() call.
    wb: Workbook
    # Display name of the file, used in report/warning messages.
    filename: str
    # Raw .xlsx bytes to parse.
    data: BytesIO
    # Sheet names of the loaded workbook (empty until load_workbook()).
    sheet_list: List[str]
    # Title of the workbook's active sheet ("" until load_workbook()).
    active_sheet: str
    # Workbook core + custom document properties ({} until load_workbook()).
    properties: Dict[str, Any]
    # Sink for warnings / dropped-file accounting.
    report: ExcelSourceReport

    def __init__(
        self,
        filename: str,
        data: BytesIO,
        report: ExcelSourceReport,
    ) -> None:
        """Store the inputs; no parsing happens until load_workbook()."""
        self.filename = filename
        self.data = data
        self.report = report
        self.sheet_list = []
        self.active_sheet = ""
        self.properties = {}

    def load_workbook(self) -> bool:
        """Parse the buffered bytes into a workbook.

        Returns True on success. On any failure the file is reported as
        dropped, a warning is recorded, and False is returned.
        """
        try:
            # data_only=True reads cached formula results instead of formulas.
            self.wb = openpyxl.load_workbook(self.data, data_only=True)
            self.properties = self.read_excel_properties(self.wb)
            self.sheet_list = self.wb.sheetnames
            # NOTE(review): assumes wb.active is not None; if it is, the
            # AttributeError is swallowed by the except below and the whole
            # file is dropped.
            self.active_sheet = self.wb.active.title
            return True
        except Exception as e:
            self.report.report_file_dropped(self.filename)
            self.report.warning(
                message="Error reading Excel file",
                context=f"Filename={self.filename}",
                exc=e,
            )
            return False

    @property
    def sheet_names(self) -> List[str]:
        """Names of all worksheets in the loaded workbook."""
        return self.sheet_list

    @property
    def active_sheet_name(self) -> str:
        """Title of the workbook's active worksheet."""
        return self.active_sheet

    @property
    def workbook_properties(self) -> Dict[str, Any]:
        """Core + custom document properties read at load time."""
        return self.properties

    def get_tables(self, active_only: Optional[bool] = False) -> Iterator[ExcelTable]:
        """Yield one ExcelTable per worksheet that contains a detectable table.

        When active_only is truthy, only the active sheet is examined.
        Sheets where no table is found are reported as dropped with a warning.
        """
        sheet_list = [self.active_sheet] if active_only else self.sheet_list
        for sheet in sheet_list:
            table = self.get_table(sheet)
            if table is not None:
                yield table
            else:
                self.report.report_worksheet_dropped(sheet)
                self.report.warning(
                    message="Worksheet does not contain a table",
                    context=f"Worksheet=[{self.filename}]{sheet}",
                )

    def get_table(self, sheet_name: str) -> Union[ExcelTable, None]:
        """Extract the tabular region of one worksheet.

        Pipeline: read all cell values, locate the header row, locate the
        footer start, harvest key/value metadata from the rows above the
        header and below the footer, build column names (de-duplicated,
        blanks named "Unnamed_<i>"), and wrap the data rows in a DataFrame.
        Returns None when no header row can be found.
        """
        sheet = self.wb[sheet_name]

        # Extract all rows from the sheet
        rows = [[cell.value for cell in row] for row in sheet.rows]

        # Find a potential header row
        header_row_idx = self.find_header_row(rows)
        if header_row_idx is None:
            return None

        # Find where the footer starts
        footer_start_idx = self.find_footer_start(rows, header_row_idx)

        # Extract metadata before the header
        header_metadata = self.extract_metadata(rows[:header_row_idx])

        # Extract footer metadata
        footer_metadata: Dict[str, Any] = {}
        if footer_start_idx < len(rows):
            footer_metadata = self.extract_metadata(rows[footer_start_idx:])

        # Combine metadata; workbook properties take priority, and a
        # sheet-level key colliding with an existing key is stored under
        # "<key>_1" instead of overwriting it.
        metadata: Dict[str, Any] = {}
        metadata.update(self.properties)

        # Add header metadata
        for key, value in header_metadata.items():
            if key not in metadata:
                metadata[key] = value
            else:
                metadata[f"{key}_1"] = value

        # Add footer metadata
        for key, value in footer_metadata.items():
            if key not in metadata:
                metadata[key] = value
            else:
                metadata[f"{key}_1"] = value

        # Get the header row
        header_row = rows[header_row_idx]

        # Find the last non-empty column in the header row
        last_non_empty_idx = -1
        for i in range(len(header_row) - 1, -1, -1):
            if header_row[i] is not None and str(header_row[i]).strip() != "":
                last_non_empty_idx = i
                break

        # Truncate the header row to remove empty trailing columns
        if last_non_empty_idx >= 0:
            header_row = header_row[: last_non_empty_idx + 1]

        # Create the column names for the DataFrame
        column_names: List[str] = []
        seen_columns: Dict[str, int] = {}
        for i, col in enumerate(header_row):
            if col is None or str(col).strip() == "":
                col_name = f"Unnamed_{i}"
            else:
                col_name = str(col).strip()

            # Duplicate names get a numeric suffix: "x", "x_1", "x_2", ...
            if col_name in seen_columns:
                seen_columns[col_name] += 1
                col_name = f"{col_name}_{seen_columns[col_name]}"
            else:
                seen_columns[col_name] = 0

            column_names.append(col_name)

        # Create the DataFrame with the table data
        data_rows = rows[header_row_idx + 1 : footer_start_idx]

        # Truncate data rows to match the header length
        truncated_data_rows = [
            row[: len(column_names)] if len(row) > len(column_names) else row
            for row in data_rows
        ]

        # Create the final DataFrame
        df = pd.DataFrame(truncated_data_rows, columns=column_names)

        row_count = df.shape[0]
        column_count = df.shape[1]

        return ExcelTable(
            df,
            header_row_idx + 1,
            footer_start_idx,
            row_count,
            column_count,
            metadata,
            sheet.title.strip(),
        )

    def find_header_row(self, rows: List[List[Any]]) -> Union[int, None]:
        """Return the index of the most header-like row, or None.

        Each non-empty candidate row is scored by _calculate_header_row_score
        against a lookahead of up to 3 following rows; the highest-scoring row
        wins. Returns None when no candidate scores above 0. Note: when the
        sheet is too short to evaluate (fewer than start_idx + 3 rows), the
        default index 0 is returned without scoring.
        """
        max_score = -1
        header_idx = 0

        # Skip empty rows at the beginning
        start_idx = self._find_first_non_empty_row(rows)

        # Evaluate each potential header row with a lookahead
        min_rows_required = 2

        # Skip evaluation if there aren't enough rows
        if len(rows) < start_idx + min_rows_required + 1:
            return header_idx

        for i in range(start_idx, len(rows) - min_rows_required):
            current_row = rows[i]
            # Take as many next rows as available, up to 3
            next_rows = rows[i + 1 : min(i + 4, len(rows))]

            # Skip empty rows
            if not self._is_non_empty_row(current_row):
                continue

            score = self._calculate_header_row_score(i, current_row, next_rows, rows)

            if score > max_score:
                max_score = score
                header_idx = i

        if max_score <= 0:
            return None
        else:
            return header_idx

    def _find_first_non_empty_row(self, rows: List[List[Any]]) -> int:
        """Index of the first row with any non-blank cell; 0 if all are empty."""
        for i, row in enumerate(rows):
            if self._is_non_empty_row(row):
                return i
        return 0

    @staticmethod
    def _is_non_empty_row(row: List[Any]) -> bool:
        """True when at least one cell is neither None nor whitespace-only."""
        return any(cell is not None and str(cell).strip() != "" for cell in row)

    def _calculate_header_row_score(
        self,
        row_idx: int,
        current_row: List[Any],
        next_rows: List[List[Any]],
        all_rows: List[List[Any]],
    ) -> int:
        """Combine the individual scoring heuristics into one header score.

        A row containing numeric cells is penalized first; if that alone makes
        the score negative, the row is rejected early without further checks.
        """
        score = 0

        score += ExcelFile._score_row_with_numeric_cells(current_row)
        if score < 0:
            return score
        score += self._score_non_empty_cells(row_idx, current_row, all_rows)
        score += self._score_header_like_text(current_row)
        score += self._score_text_followed_by_numeric(current_row, next_rows)
        score += self._score_column_type_consistency(current_row, next_rows)
        score += self._score_metadata_patterns(row_idx, current_row, all_rows)

        return score

    @staticmethod
    def _score_non_empty_cells(
        row_idx: int, current_row: List[Any], all_rows: List[List[Any]]
    ) -> int:
        """+2 when this row is more densely populated than the row above it."""
        if row_idx <= 0:
            return 0

        non_empty_current = sum(
            1 for cell in current_row if cell is not None and str(cell).strip() != ""
        )

        non_empty_prev = sum(
            1
            for cell in all_rows[row_idx - 1]
            if cell is not None and str(cell).strip() != ""
        )

        return 2 if non_empty_current > non_empty_prev else 0

    @staticmethod
    def _score_header_like_text(row: List[Any]) -> int:
        """+1 per cell that looks like a title: capitalized letters/spaces only."""
        return sum(
            1
            for cell in row
            if cell is not None
            and isinstance(cell, str)
            and re.match(r"^[A-Z][a-zA-Z\s]*$", str(cell).strip())
        )

    @staticmethod
    def _score_row_with_numeric_cells(row: List[Any]) -> int:
        """-1 per int/float cell: numeric cells suggest data, not headers.

        NOTE(review): bool is a subclass of int, so True/False cells are
        also counted as numeric here — confirm this is intended.
        """
        return sum(
            -1 for cell in row if cell is not None and isinstance(cell, (int, float))
        )

    @staticmethod
    def _score_text_followed_by_numeric(
        current_row: List[Any], next_rows: List[List[Any]]
    ) -> int:
        """Strong signal: text cells here followed by numeric-looking rows.

        Returns 6 plus 1 per following row containing at least one numeric
        cell (real numbers or strings matching a number pattern); 0 otherwise.
        """
        if not next_rows:
            return 0

        header_text_count = sum(
            1 for cell in current_row if cell is not None and isinstance(cell, str)
        )

        next_rows_numeric_count = [
            sum(
                1
                for cell in row
                if cell is not None
                and (
                    isinstance(cell, (int, float))
                    or (
                        isinstance(cell, str)
                        and re.match(r"^-?\d+(\.\d+)?$", str(cell).strip())
                    )
                )
            )
            for row in next_rows
        ]

        if header_text_count > 0 and any(
            count > 0 for count in next_rows_numeric_count
        ):
            return 6 + sum(1 for count in next_rows_numeric_count if count > 0)
        return 0

    @staticmethod
    def _score_column_type_consistency(
        current_row: List[Any], next_rows: List[List[Any]]
    ) -> int:
        """+3 when at least two populated columns show a dominant value type
        in the lookahead rows (typed columns suggest a real table below)."""
        if len(next_rows) < 2:
            return 0

        col_types = []
        for col_idx in range(len(current_row)):
            if col_idx < len(current_row) and current_row[col_idx] is not None:
                col_type_counter: Counter = Counter()
                for row in next_rows:
                    if col_idx < len(row) and row[col_idx] is not None:
                        cell_type = type(row[col_idx]).__name__
                        col_type_counter[cell_type] += 1

                # Record the most frequent type name seen in this column.
                if col_type_counter and col_type_counter.most_common(1)[0][1] >= 1:
                    col_types.append(col_type_counter.most_common(1)[0][0])

        return 3 if len(col_types) >= 2 and len(set(col_types)) >= 1 else 0

    @staticmethod
    def _score_metadata_patterns(
        row_idx: int, current_row: List[Any], all_rows: List[List[Any]]
    ) -> int:
        """Penalize rows that look like metadata rather than a header.

        -1 for a short (<=2 cell) first row of short strings; -2 when the
        first two cells are both strings (typical key/value metadata shape).
        """
        score = 0

        if row_idx == 0 and len(current_row) <= 2:
            metadata_like = sum(
                1
                for cell in current_row
                if cell is not None and isinstance(cell, str) and len(str(cell)) <= 20
            )
            if metadata_like <= 2:
                score -= 1

        if row_idx < len(all_rows) - 1 and len(current_row) >= 2:
            if all(
                isinstance(cell, str) for cell in current_row[:2] if cell is not None
            ):
                score -= 2

        return score

    @staticmethod
    def find_footer_start(rows: List[List[Any]], header_row_idx: int) -> int:
        """Return the index of the first footer row after the table.

        Returns len(rows) when no footer is detected (i.e. everything after
        the header is data). A footer is signalled by: an empty row followed
        by a sparse or text-heavy row; a sparse row late in the table; a
        summary row ("total", "note", ...); very long text cells; or a row
        whose cell types break the column-type pattern established above it.
        """
        if header_row_idx + 1 >= len(rows):
            return len(rows)

        # Start with the assumption that all rows after the header are data (no footer)
        footer_start_idx = len(rows)

        # Get the number of columns in the header row to determine table width
        header_row = rows[header_row_idx]
        table_width = sum(
            1 for cell in header_row if cell is not None and str(cell).strip() != ""
        )

        # Get a sample of data rows to establish patterns
        data_sample_idx = min(header_row_idx + 5, len(rows) - 1)
        data_rows = rows[header_row_idx + 1 : data_sample_idx + 1]

        # Check for rows with significantly fewer populated cells than the data rows
        # (data_rows is never empty here thanks to the early return above).
        avg_populated_cells = sum(
            sum(1 for cell in row if cell is not None and str(cell).strip() != "")
            for row in data_rows
        ) / len(data_rows)

        # Look for pattern breaks, empty rows, or format changes
        for i in range(header_row_idx + 1, len(rows)):
            current_row = rows[i]

            # Skip completely empty rows unless followed by non-data-like rows
            if not any(
                cell is not None and str(cell).strip() != "" for cell in current_row
            ):
                # Look ahead to see if this empty row marks the start of footer
                if i + 1 < len(rows):
                    next_row = rows[i + 1]
                    next_row_populated = sum(
                        1
                        for cell in next_row
                        if cell is not None and str(cell).strip() != ""
                    )

                    # If the next row has significantly fewer populated cells or is text-heavy,
                    # consider this the start of footer
                    if (
                        next_row_populated < avg_populated_cells * 0.5
                        or sum(
                            1
                            for cell in next_row
                            if cell is not None
                            and isinstance(cell, str)
                            and len(str(cell)) > 20
                        )
                        > 0
                    ):
                        footer_start_idx = i
                        break
                continue

            # Count populated cells
            populated_cells = sum(
                1
                for cell in current_row
                if cell is not None and str(cell).strip() != ""
            )

            # Check for footer indicators
            footer_indicators = [
                "total",
                "sum",
                "average",
                "mean",
                "source",
                "note",
                "footnote",
            ]
            has_footer_text = any(
                cell is not None
                and isinstance(cell, str)
                and any(
                    indicator in str(cell).lower() for indicator in footer_indicators
                )
                for cell in current_row
            )

            # Check for the summary row
            looks_like_summary = has_footer_text and populated_cells <= table_width

            # Check for notes or sources (often longer text spanning multiple columns)
            long_text_cells = sum(
                1
                for cell in current_row
                if cell is not None and isinstance(cell, str) and len(str(cell)) > 50
            )

            # If this looks like the start of the footer, mark it
            if (
                (populated_cells < avg_populated_cells * 0.7 and i > header_row_idx + 3)
                or looks_like_summary
                or long_text_cells > 0
            ):
                footer_start_idx = i
                break

            # Check for inconsistent data types compared to data rows
            if i > header_row_idx + 3:
                data_type_mismatch = 0
                for j, cell in enumerate(current_row):
                    if j < len(header_row) and header_row[j] is not None:
                        # Get the most common data type for this column in previous rows
                        col_types = [
                            type(rows[idx][j]).__name__
                            for idx in range(header_row_idx + 1, i)
                            if idx < len(rows)
                            and j < len(rows[idx])
                            and rows[idx][j] is not None
                        ]
                        if col_types and cell is not None:
                            most_common_type = Counter(col_types).most_common(1)[0][0]
                            if type(cell).__name__ != most_common_type:
                                data_type_mismatch += 1

                # If many columns have type mismatches, this might be a footer row
                if data_type_mismatch > table_width * 0.5:
                    footer_start_idx = i
                    break

        return footer_start_idx

    @staticmethod
    def extract_metadata(rows: List[List[Any]]) -> Dict[str, Any]:
        """Harvest key/value metadata from rows around the table.

        A row qualifies when every cell past the second is None and its first
        two cells are both non-None; the key is stripped of a trailing ":" or
        "=" and surrounding whitespace, the value is stringified and stripped.
        """
        metadata = {}

        for row in rows:
            if len(row) >= 2 and all(item is None for item in row[2:]):
                key, value = row[:2]
                if key is not None and value is not None:
                    # rstrip(":=") removes any trailing run of ':' / '=' chars.
                    metadata[str(key).strip().rstrip(":=").rstrip()] = str(
                        value
                    ).strip()

        return metadata

    @staticmethod
    def read_excel_properties(wb: Workbook) -> Dict[str, Any]:
        """Collect the workbook's core and custom document properties.

        None-valued core properties are dropped; a custom property whose
        name collides with a core property is stored as "custom.<name>".
        """
        # Core properties from DocumentProperties
        core_props = wb.properties
        properties = {
            "title": core_props.title,
            "author": core_props.creator,
            "subject": core_props.subject,
            "description": core_props.description,
            "keywords": core_props.keywords,
            "category": core_props.category,
            "last_modified_by": core_props.lastModifiedBy,
            "created": core_props.created,
            "modified": core_props.modified,
            "status": core_props.contentStatus,
            "revision": core_props.revision,
            "version": core_props.version,
            "language": core_props.language,
            "identifier": core_props.identifier,
        }

        # Remove None values
        properties = {k: v for k, v in properties.items() if v is not None}

        # Assign custom properties if they exist
        if hasattr(wb, "custom_doc_props"):
            for prop in wb.custom_doc_props.props:
                if prop.value:
                    if prop.name in properties:
                        prop_name = f"custom.{prop.name}"
                    else:
                        prop_name = prop.name
                    properties[prop_name] = prop.value

        return properties
|