html-table-parse 0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.3
2
+ Name: html-table-parse
3
+ Version: 0.2
4
+ Summary: Parse HTML table as Python list or dict
5
+ Author: 5j9
6
+ Author-email: 5j9 <5j9@users.noreply.github.com>
7
+ License: GPL-3.0
8
+ Requires-Dist: lxml>=6.1.1
9
+ Requires-Python: >=3.14
10
+ Project-URL: Homepage, https://github.com/5j9/html-table-parse
11
+ Description-Content-Type: text/x-rst
12
+
13
+ Parse HTML table as Python list or dict
14
+
15
+ Requires Python 3.14+
@@ -0,0 +1,3 @@
1
+ Parse HTML table as Python list or dict
2
+
3
+ Requires Python 3.14+
@@ -0,0 +1,170 @@
1
+ __version__ = '0.2'
2
+
3
+ from collections import defaultdict as _defaultdict
4
+ from functools import partial as _partial
5
+ from re import compile as _rc
6
+
7
+ from lxml import html as _html
8
+
9
# Replace every run of whitespace matched by the regex ``\s+`` with a single
# space. Call as ``normalize_spaces(text)``; it returns the rewritten string.
normalize_spaces = _partial(_rc(r'\s+').sub, ' ')
10
+
11
+
12
def to_list(html_str: str, index: int = 0) -> list[list]:
    """Return the rows of one ``<table>`` found in *html_str*.

    Every ``<table>`` element in the parsed document is collected in
    document order (the ``//table`` query also matches nested tables) and
    the one at position *index* is handed to ``_parse_table``.

    Raises:
        ValueError: if the document contains no table, or if *index* is
            greater than or equal to the number of tables found.
    """
    document = _html.fromstring(html_str)
    tables = document.xpath('//table')

    if not tables:
        raise ValueError('No tables found in HTML')

    if index < len(tables):
        return _parse_table(tables[index])

    raise ValueError(
        f'Table index {index} out of range (found {len(tables)} tables)'
    )
27
+
28
+
29
def _parse_table(table_element) -> list[list]:
    """Parse a single table element into rows of cell text.

    Each ``<tr>`` below *table_element* becomes one list of strings:

    * a cell with ``colspan=N`` contributes its text N times;
    * a column covered by a ``rowspan`` of an earlier row contributes an
      empty string, so later cells stay in their own column.

    Rows whose cells are all empty are dropped, except for the first row
    when it contains a ``<th>`` (it is then kept as the header row).

    Rows and cells are looked up with the descendant axis (``.//``), so the
    rows and cells of tables nested inside *table_element* are included.

    Returns an empty list when the table has no rows.
    """
    rows = table_element.xpath('.//tr')
    if not rows:
        return []

    # The first row counts as a header as soon as it holds one <th>.
    has_header = any(
        cell.tag == 'th' for cell in rows[0].xpath('.//td | .//th')
    )

    parsed_rows = []
    # column index -> number of upcoming rows still covered by a rowspan
    # that was opened in an earlier row.
    carried = {}

    for row_idx, row in enumerate(rows):
        row_data = []
        # Spans opened by this row's own cells; kept apart from `carried`
        # so that they are not aged until the next row has been handled.
        started = {}

        for cell in row.xpath('.//td | .//th'):
            # len(row_data) is always the index of the next free column.
            # Step over columns still occupied by a rowspan from above.
            while len(row_data) in carried:
                row_data.append('')

            col_idx = len(row_data)
            colspan = _span_value(cell, 'colspan', _MAX_COLSPAN)
            rowspan = _span_value(cell, 'rowspan', _MAX_ROWSPAN)

            # A cell spanning several columns repeats its text in each.
            row_data.extend([_clean_text(cell)] * colspan)

            if rowspan > 1:
                for covered in range(col_idx, col_idx + colspan):
                    started[covered] = rowspan - 1

        # Columns to the right of the last cell may still be covered by a
        # rowspan from above. Emit their placeholders here; otherwise the
        # span would survive into the next row and shift its cells.
        for covered in sorted(carried):
            if covered >= len(row_data):
                row_data.extend([''] * (covered + 1 - len(row_data)))

        # Every span inherited from above has now covered exactly one more
        # row; spans opened by this row start counting from the next one.
        carried = {
            column: remaining - 1
            for column, remaining in carried.items()
            if remaining > 1
        }
        carried.update(started)

        is_header_row = row_idx == 0 and has_header

        # Only keep non-empty rows (or the header row).
        if any(row_data) or is_header_row:
            parsed_rows.append(row_data)

    return parsed_rows


# Upper bounds the HTML Standard places on the two span attributes. They
# keep a hostile ``colspan="999999999"`` from allocating a gigantic row.
_MAX_COLSPAN = 1000
_MAX_ROWSPAN = 65534


def _span_value(cell, name: str, limit: int) -> int:
    """Return attribute *name* of *cell* as an int within ``1..limit``.

    A missing, non-numeric, zero or negative value is treated as 1 so that
    malformed markup neither raises nor makes the cell disappear.
    """
    try:
        value = int(cell.get(name, 1))
    except ValueError:
        return 1
    return min(max(value, 1), limit)
90
+
91
+
92
def _clean_text(element) -> str:
    """Return the text of *element* with whitespace normalised.

    The element's full text content is taken, every whitespace run is
    replaced through ``normalize_spaces`` and the result is stripped at
    both ends. ``None`` yields an empty string.
    """
    if element is None:
        return ''
    return normalize_spaces(element.text_content()).strip()
99
+
100
+
101
def to_dict(html: str, index: int = 0) -> dict[str, list]:
    """Return the table at *index* as a mapping of column name to values.

    The first parsed row supplies the column names; every following row
    contributes one value per column. Rows shorter than the header are
    padded with empty strings and longer rows are truncated.

    Duplicate column names are made unique by appending ``_2``, ``_3``, ...
    An empty dict is returned when the table has no rows.

    Raises:
        ValueError: propagated from ``to_list`` when no table matches.
    """
    rows = to_list(html, index)
    if not rows:
        return {}

    headers = _unique_headers(rows[0])
    width = len(headers)
    result = {header: [] for header in headers}

    for row in rows[1:]:
        for header, value in zip(headers, _fit_row(row, width)):
            result[header].append(value)

    return result


def to_dicts(html: str, index: int = 0) -> list[dict]:
    """Return the table at *index* as a list of dicts, one per data row.

    The first parsed row supplies the keys. Rows shorter than the header
    are padded with empty strings and longer rows are truncated.

    Duplicate column names are made unique by appending ``_2``, ``_3``, ...
    An empty list is returned when the table has no rows.

    Raises:
        ValueError: propagated from ``to_list`` when no table matches.
    """
    rows = to_list(html, index)
    if not rows:
        return []

    headers = _unique_headers(rows[0])
    width = len(headers)
    return [dict(zip(headers, _fit_row(row, width))) for row in rows[1:]]


def _unique_headers(headers: list) -> list:
    """Return *headers* with repeated names suffixed so all are distinct.

    The first occurrence keeps its name; the n-th repeat becomes
    ``name_n``. If that name is already taken (for example by a literal
    ``a_2`` column next to two ``a`` columns) the number is increased until
    a free name is found, so no two columns can end up sharing a key.
    """
    taken = set()
    occurrences = _defaultdict(int)
    unique = []

    for header in headers:
        occurrences[header] += 1
        number = occurrences[header]
        candidate = header if number == 1 else f'{header}_{number}'
        while candidate in taken:
            number += 1
            candidate = f'{header}_{number}'
        # Remember the number actually used so later repeats continue
        # counting after it.
        occurrences[header] = number
        taken.add(candidate)
        unique.append(candidate)

    return unique


def _fit_row(row: list, width: int) -> list:
    """Return *row* padded with ``''`` or truncated to exactly *width*."""
    return (row + [''] * (width - len(row)))[:width]
@@ -0,0 +1,66 @@
1
+ [build-system]
2
+ requires = ['uv_build>=0.8.3,<0.9.0']
3
+ build-backend = 'uv_build'
4
+
5
+ [project]
6
+ name = 'html-table-parse'
7
+ version = "0.2"
8
+ authors = [{ name = '5j9', email = '5j9@users.noreply.github.com' }]
9
+ description = 'Parse HTML table as Python list or dict'
10
+ readme = 'README.rst'
11
+ requires-python = '>=3.14'
12
+ dependencies = [
13
+ "lxml>=6.1.1",
14
+ ]
15
+ license = { text = 'GPL-3.0' }
16
+
17
+ [project.urls]
18
+ Homepage = 'https://github.com/5j9/html-table-parse'
19
+
20
+
21
+ [tool.uv.build-backend]
22
+ module-root = ''
23
+ module-name = "html_table_parse"
24
+
25
+
26
+ [tool.ruff]
27
+ line-length = 79
28
+ format.quote-style = 'single'
29
+ lint.isort.combine-as-imports = true
30
+ lint.extend-select = [
31
+ 'W605', # invalid-escape-sequence
32
+ 'FA', # flake8-future-annotations
33
+ 'I', # isort
34
+ 'UP', # pyupgrade
35
+ 'RUF', # Ruff-specific rules (RUF)
36
+ ]
37
+ lint.ignore = [
38
+ 'E721', # Do not compare types, use `isinstance()`
39
+ 'RUF001', # ambiguous-unicode-character-string
40
+ 'RUF002', # ambiguous-unicode-character-docstring
41
+ 'RUF003', # ambiguous-unicode-character-comment
42
+ 'RUF012', # mutable-class-default
43
+ 'RUF059', # Unpacked variable never used
44
+ ]
45
+
46
+ [tool.pytest.ini_options]
47
+ addopts = '--quiet --tb=short'
48
+ [tool.pyright]
49
+ typeCheckingMode = 'standard'
50
+ reportInvalidStringEscapeSequence = false
51
+ reportConstantRedefinition = 'error'
52
+ reportDeprecated = 'warning'
53
+ reportPropertyTypeMismatch = 'error'
54
+ reportTypeCommentUsage = 'warning'
55
+ reportUnnecessaryCast = 'warning'
56
+ reportUnnecessaryComparison = 'warning'
57
+ reportUnnecessaryContains = 'warning'
58
+ reportUnnecessaryIsInstance = 'warning'
59
+ reportUnnecessaryTypeIgnoreComment = 'warning'
60
+ venvPath = "."
61
+ venv = ".venv"
62
+
63
+ [dependency-groups]
64
+ dev = [
65
+ "pytest>=9.1.1",
66
+ ]