html-table-parse 0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: html-table-parse
|
|
3
|
+
Version: 0.2
|
|
4
|
+
Summary: Parse HTML table as Python list or dict
|
|
5
|
+
Author: 5j9
|
|
6
|
+
Author-email: 5j9 <5j9@users.noreply.github.com>
|
|
7
|
+
License: GPL-3.0
|
|
8
|
+
Requires-Dist: lxml>=6.1.1
|
|
9
|
+
Requires-Python: >=3.14
|
|
10
|
+
Project-URL: Homepage, https://github.com/5j9/html-table-parse
|
|
11
|
+
Description-Content-Type: text/x-rst
|
|
12
|
+
|
|
13
|
+
Parse HTML table as Python list or dict
|
|
14
|
+
|
|
15
|
+
Requires Python 3.14+
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
__version__ = '0.2'
|
|
2
|
+
|
|
3
|
+
from collections import defaultdict as _defaultdict
|
|
4
|
+
from functools import partial as _partial
|
|
5
|
+
from re import compile as _rc
|
|
6
|
+
|
|
7
|
+
from lxml import html as _html
|
|
8
|
+
|
|
9
|
+
# Callable taking a string and returning it with every run of whitespace
# characters (regex ``\s+``) replaced by a single space.
normalize_spaces = _partial(_rc(r'\s+').sub, ' ')
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def to_list(html_str: str, index: int = 0) -> list[list]:
    """Return the rows of one table found in *html_str*.

    *index* selects which ``<table>`` of the document is parsed; the
    default is the first one.  Each row is a list of cell texts.

    Raises ``ValueError`` when the document contains no table or when
    *index* is not smaller than the number of tables.  A negative
    *index* is not range-checked here; it is passed straight to the
    ``found[index]`` lookup.
    """
    found = _html.fromstring(html_str).xpath('//table')
    if not found:
        raise ValueError('No tables found in HTML')

    count = len(found)
    if index >= count:
        raise ValueError(
            f'Table index {index} out of range (found {count} tables)'
        )

    return _parse_table(found[index])
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _span_value(cell, name: str) -> int:
    """Read the ``colspan``/``rowspan`` attribute *name* of *cell*.

    A missing, non-numeric or non-positive value counts as 1, so a
    malformed attribute can neither raise nor drop the cell.
    """
    try:
        span = int(cell.get(name, 1))
    except (TypeError, ValueError):
        return 1
    return max(span, 1)


def _parse_table(table_element) -> list[list]:
    """Parse one table element into a list of rows of cell text.

    A cell with ``colspan=N`` contributes its text N times.  A cell with
    ``rowspan=N`` reserves its column(s) in the following N - 1 rows,
    where an empty string is emitted as a placeholder.  Rows whose cells
    are all empty are dropped, except for a first row containing a
    ``th`` cell.

    Rows and cells are collected with descendant XPath queries
    (``.//tr``, ``.//td | .//th``), so anything nested deeper inside the
    table is matched as well.
    """
    rows = table_element.xpath('.//tr')
    if not rows:
        return []

    # The first row counts as a header when it holds at least one <th>.
    has_header = any(
        cell.tag == 'th' for cell in rows[0].xpath('.//td | .//th')
    )

    parsed_rows = []
    # column index -> number of upcoming rows still covered by a rowspan
    pending = {}

    for row_idx, row in enumerate(rows):
        # Columns covered in *this* row by rowspans from earlier rows.
        blocked = set(pending)
        # Every active span is consumed exactly once per row, whether or
        # not a cell of this row happens to reach its column.  Spans left
        # untouched would otherwise shift the cells of later rows.
        pending = {
            col: left - 1 for col, left in pending.items() if left > 1
        }

        row_data = []
        col_idx = 0

        for cell in row.xpath('.//td | .//th'):
            # Move past columns that are occupied from above.
            while col_idx in blocked:
                row_data.append('')
                col_idx += 1

            text = _clean_text(cell)
            colspan = _span_value(cell, 'colspan')
            rowspan = _span_value(cell, 'rowspan')

            # A cell spanning several columns repeats its text in each.
            row_data.extend([text] * colspan)

            if rowspan > 1:
                for col in range(col_idx, col_idx + colspan):
                    pending[col] = rowspan - 1

            col_idx += colspan

        # Blocked columns to the right of the last cell still belong to
        # this row; pad up to and including the right-most one.
        if blocked:
            row_data.extend([''] * (max(blocked) + 1 - col_idx))

        is_header_row = row_idx == 0 and has_header

        # Keep only rows with some content, plus the header row.
        if any(row_data) or is_header_row:
            parsed_rows.append(row_data)

    return parsed_rows
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _clean_text(element) -> str:
    """Return the whitespace-normalised text of *element*.

    ``None`` yields an empty string.  Otherwise the element's
    ``text_content()`` is passed through ``normalize_spaces`` and the
    result is stripped at both ends.
    """
    if element is not None:
        return normalize_spaces(element.text_content()).strip()
    return ''
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _unique_headers(headers: list[str]) -> list[str]:
    """Return *headers* with repeated names made unique.

    The first occurrence of a name is kept unchanged; its n-th
    occurrence becomes ``name_n`` (``name_2``, ``name_3``, ...).  If
    the chosen name is already taken by another column, the number
    keeps increasing until the name is free, so no two columns ever
    share a key.
    """
    occurrences = _defaultdict(int)
    taken = set()
    unique = []
    for header in headers:
        seen = occurrences[header]
        candidate = header if seen == 0 else f'{header}_{seen + 1}'
        # A generated name may equal a literal header such as 'a_2'.
        while candidate in taken:
            seen += 1
            candidate = f'{header}_{seen + 1}'
        occurrences[header] = seen + 1
        taken.add(candidate)
        unique.append(candidate)
    return unique


def _fit_row(row: list, width: int) -> list:
    """Pad *row* with empty strings, or cut it, to exactly *width* items."""
    return (row + [''] * (width - len(row)))[:width]


def to_dict(html: str, index: int = 0) -> dict[str, list]:
    """Return the table as a mapping of column name to column values.

    The first row of the table number *index* supplies the column
    names; duplicate names are made unique by appending a number.
    Every later row is padded with empty strings or truncated to the
    header width.  A table without rows gives an empty dict.
    """
    rows = to_list(html, index)
    if not rows:
        return {}

    headers = _unique_headers(rows[0])
    columns = {header: [] for header in headers}
    for row in rows[1:]:
        for header, value in zip(headers, _fit_row(row, len(headers))):
            columns[header].append(value)
    return columns


def to_dicts(html: str, index: int = 0) -> list[dict]:
    """Return the table as a list of dicts, one per data row.

    The first row of the table number *index* supplies the keys;
    duplicate names are made unique by appending a number.  Every later
    row is padded with empty strings or truncated to the header width.
    A table without rows gives an empty list.
    """
    rows = to_list(html, index)
    if not rows:
        return []

    headers = _unique_headers(rows[0])
    return [
        dict(zip(headers, _fit_row(row, len(headers)))) for row in rows[1:]
    ]
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ['uv_build>=0.8.3,<0.9.0']
|
|
3
|
+
build-backend = 'uv_build'
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = 'html-table-parse'
|
|
7
|
+
version = "0.2"
|
|
8
|
+
authors = [{ name = '5j9', email = '5j9@users.noreply.github.com' }]
|
|
9
|
+
description = 'Parse HTML table as Python list or dict'
|
|
10
|
+
readme = 'README.rst'
|
|
11
|
+
requires-python = '>=3.14'
|
|
12
|
+
dependencies = [
|
|
13
|
+
"lxml>=6.1.1",
|
|
14
|
+
]
|
|
15
|
+
license = { text = 'GPL-3.0' }
|
|
16
|
+
|
|
17
|
+
[project.urls]
|
|
18
|
+
Homepage = 'https://github.com/5j9/html-table-parse'
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
[tool.uv.build-backend]
|
|
22
|
+
module-root = ''
|
|
23
|
+
module-name = "html_table_parse"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
[tool.ruff]
|
|
27
|
+
line-length = 79
|
|
28
|
+
format.quote-style = 'single'
|
|
29
|
+
lint.isort.combine-as-imports = true
|
|
30
|
+
lint.extend-select = [
|
|
31
|
+
'W605', # invalid-escape-sequence
|
|
32
|
+
'FA', # flake8-future-annotations
|
|
33
|
+
'I', # isort
|
|
34
|
+
'UP', # pyupgrade
|
|
35
|
+
'RUF', # Ruff-specific rules (RUF)
|
|
36
|
+
]
|
|
37
|
+
lint.ignore = [
|
|
38
|
+
'E721', # Do not compare types, use `isinstance()`
|
|
39
|
+
'RUF001', # ambiguous-unicode-character-string
|
|
40
|
+
'RUF002', # ambiguous-unicode-character-docstring
|
|
41
|
+
'RUF003', # ambiguous-unicode-character-comment
|
|
42
|
+
'RUF012', # mutable-class-default
|
|
43
|
+
'RUF059', # Unpacked variable never used
|
|
44
|
+
]
|
|
45
|
+
|
|
46
|
+
[tool.pytest.ini_options]
|
|
47
|
+
addopts = '--quiet --tb=short'
|
|
48
|
+
[tool.pyright]
|
|
49
|
+
typeCheckingMode = 'standard'
|
|
50
|
+
reportInvalidStringEscapeSequence = false
|
|
51
|
+
reportConstantRedefinition = 'error'
|
|
52
|
+
reportDeprecated = 'warning'
|
|
53
|
+
reportPropertyTypeMismatch = 'error'
|
|
54
|
+
reportTypeCommentUsage = 'warning'
|
|
55
|
+
reportUnnecessaryCast = 'warning'
|
|
56
|
+
reportUnnecessaryComparison = 'warning'
|
|
57
|
+
reportUnnecessaryContains = 'warning'
|
|
58
|
+
reportUnnecessaryIsInstance = 'warning'
|
|
59
|
+
reportUnnecessaryTypeIgnoreComment = 'warning'
|
|
60
|
+
venvPath = "."
|
|
61
|
+
venv = ".venv"
|
|
62
|
+
|
|
63
|
+
[dependency-groups]
|
|
64
|
+
dev = [
|
|
65
|
+
"pytest>=9.1.1",
|
|
66
|
+
]
|