content-extraction 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- content_extraction/file_handlers.py +1 -1
- content_extraction/semantic_chunk_html.py +16 -24
- {content_extraction-0.4.0.dist-info → content_extraction-0.4.2.dist-info}/METADATA +1 -2
- {content_extraction-0.4.0.dist-info → content_extraction-0.4.2.dist-info}/RECORD +6 -7
- content_extraction/dspy_modules.py +0 -24
- {content_extraction-0.4.0.dist-info → content_extraction-0.4.2.dist-info}/WHEEL +0 -0
- {content_extraction-0.4.0.dist-info → content_extraction-0.4.2.dist-info}/top_level.txt +0 -0
@@ -269,7 +269,7 @@ def process_file(input_path: str, output_dir: str, force_ext: str = '') -> str:
|
|
269
269
|
parser = HTMLSectionParser()
|
270
270
|
parsed_sections = parser.parse_sections(html_content)
|
271
271
|
parsed_sections_output_file = output_dir_path / 'parsed_sections.json'
|
272
|
-
with open(parsed_sections_output_file) as f:
|
272
|
+
with open(parsed_sections_output_file, 'w') as f:
|
273
273
|
f.write(json.dumps(parsed_sections))
|
274
274
|
|
275
275
|
logger.info('[Processing File] Splitting parsed sections and creating JSON digest.')
|
@@ -10,7 +10,7 @@ class HTMLSectionParser:
|
|
10
10
|
"""Fast parser for HTML that finds sections and splits content into subsections."""
|
11
11
|
|
12
12
|
def __init__(self):
|
13
|
-
self.heading_tags = {
|
13
|
+
self.heading_tags = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
|
14
14
|
|
15
15
|
def get_heading_level(self, element) -> int | None:
|
16
16
|
"""Extract heading level from an element."""
|
@@ -19,23 +19,21 @@ class HTMLSectionParser:
|
|
19
19
|
return int(element.name[1])
|
20
20
|
|
21
21
|
# Elements with role="heading" and aria-level
|
22
|
-
if element.get(
|
23
|
-
aria_level = element.get(
|
22
|
+
if element.get('role') == 'heading':
|
23
|
+
aria_level = element.get('aria-level')
|
24
24
|
if aria_level and aria_level.isdigit():
|
25
25
|
return int(aria_level)
|
26
26
|
# Default to level 1 if no aria-level specified
|
27
27
|
return 1
|
28
28
|
|
29
29
|
# Elements with aria-level (even without role="heading")
|
30
|
-
aria_level = element.get(
|
30
|
+
aria_level = element.get('aria-level')
|
31
31
|
if aria_level and aria_level.isdigit():
|
32
32
|
return int(aria_level)
|
33
33
|
|
34
34
|
return None
|
35
35
|
|
36
|
-
def extract_text_between_headings(
|
37
|
-
self, soup, start_element, end_element=None
|
38
|
-
) -> str:
|
36
|
+
def extract_text_between_headings(self, soup, start_element, end_element=None) -> str:
|
39
37
|
"""Extract all content between two heading elements."""
|
40
38
|
content_parts = []
|
41
39
|
current = start_element.next_sibling
|
@@ -45,8 +43,8 @@ class HTMLSectionParser:
|
|
45
43
|
# Check if this is a heading element
|
46
44
|
if (
|
47
45
|
current.name in self.heading_tags
|
48
|
-
or (current.get(
|
49
|
-
or current.get(
|
46
|
+
or (current.get('role') == 'heading')
|
47
|
+
or current.get('aria-level', '').isdigit()
|
50
48
|
):
|
51
49
|
# Hit another heading, stop
|
52
50
|
break
|
@@ -59,14 +57,12 @@ class HTMLSectionParser:
|
|
59
57
|
break
|
60
58
|
|
61
59
|
content_parts.append(str(current))
|
62
|
-
elif (
|
63
|
-
hasattr(current, "string") and current.string and current.string.strip()
|
64
|
-
):
|
60
|
+
elif hasattr(current, 'string') and current.string and current.string.strip():
|
65
61
|
# It's text content
|
66
62
|
content_parts.append(current.string)
|
67
63
|
current = current.next_sibling
|
68
64
|
|
69
|
-
return
|
65
|
+
return ''.join(content_parts).strip()
|
70
66
|
|
71
67
|
def _find_headings_in_element(self, element):
|
72
68
|
"""Find all heading elements within a given element."""
|
@@ -77,9 +73,7 @@ class HTMLSectionParser:
|
|
77
73
|
headings.append((child, level))
|
78
74
|
return headings
|
79
75
|
|
80
|
-
def find_next_heading_at_level_or_higher(
|
81
|
-
self, soup, start_element, current_level: int
|
82
|
-
):
|
76
|
+
def find_next_heading_at_level_or_higher(self, soup, start_element, current_level: int):
|
83
77
|
"""Find the next heading at the same level or higher."""
|
84
78
|
current = start_element.next_sibling
|
85
79
|
|
@@ -94,7 +88,7 @@ class HTMLSectionParser:
|
|
94
88
|
|
95
89
|
def parse_sections(self, html_content: str) -> list[dict[str, object]]:
|
96
90
|
"""Parse HTML and extract hierarchical sections."""
|
97
|
-
soup = BeautifulSoup(html_content,
|
91
|
+
soup = BeautifulSoup(html_content, 'lxml')
|
98
92
|
|
99
93
|
# Find all potential heading elements in document order
|
100
94
|
headings = []
|
@@ -141,19 +135,17 @@ class HTMLSectionParser:
|
|
141
135
|
if j < len(headings):
|
142
136
|
next_boundary = headings[j][0]
|
143
137
|
|
144
|
-
text_content = self.extract_text_between_headings(
|
145
|
-
soup, current_element, next_boundary
|
146
|
-
)
|
138
|
+
text_content = self.extract_text_between_headings(soup, current_element, next_boundary)
|
147
139
|
|
148
140
|
# Build subsections recursively
|
149
141
|
subsections = self._build_hierarchy(soup, subsection_headings)
|
150
142
|
|
151
143
|
# Build the section dictionary
|
152
144
|
section = {
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
145
|
+
'title': current_element.get_text().strip(),
|
146
|
+
'text': text_content,
|
147
|
+
'level': current_level,
|
148
|
+
'subsections': subsections,
|
157
149
|
}
|
158
150
|
|
159
151
|
result.append(section)
|
@@ -1,11 +1,10 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: content_extraction
|
3
|
-
Version: 0.4.0
|
3
|
+
Version: 0.4.2
|
4
4
|
Summary: Project dedicated to content extraction from unstructured files that contain some useful information.
|
5
5
|
Requires-Python: >=3.12
|
6
6
|
Description-Content-Type: text/markdown
|
7
7
|
Requires-Dist: beautifulsoup4>=4.13.4
|
8
|
-
Requires-Dist: dspy>=2.6.27
|
9
8
|
Requires-Dist: langdetect>=1.0.9
|
10
9
|
Requires-Dist: lxml>=6.0.0
|
11
10
|
Requires-Dist: python-pptx>=1.0.2
|
@@ -1,17 +1,16 @@
|
|
1
1
|
content_extraction/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
2
|
content_extraction/common_std_io.py,sha256=mSRaiI4OrnttEQ8Y92-LsJnAHEI3xLKnJvmXDHmkfWc,1547
|
3
3
|
content_extraction/do_ocr.py,sha256=lrqwPYQlPuUHabirH_RzKbzHgYUPPpNeHDe_u4h9LEY,6886
|
4
|
-
content_extraction/dspy_modules.py,sha256=0aAokJQNzczfowoUNK3BPMi_U18eXM9thHvciWaE5b0,732
|
5
4
|
content_extraction/extract_from_pptx.py,sha256=IWd81sn7ZsyaQZdXP5Cgbk7GspcDYEjMnBkti-pTHQY,6572
|
6
|
-
content_extraction/file_handlers.py,sha256=
|
5
|
+
content_extraction/file_handlers.py,sha256=I15c2dINQudsRY3wXsv0pNeNsXc8fm5PIZ7GkY4DfrM,11782
|
7
6
|
content_extraction/fix_ocr.py,sha256=2xJ4c3VsGSy1l-qAukvhaV8QOp6yu5BY99Gb0DwamWQ,8009
|
8
7
|
content_extraction/logging_config.py,sha256=GN1wuJJEspQ3z-FZIg134obsHweuiicZfz2an13a9_I,296
|
9
8
|
content_extraction/parse_html.py,sha256=mOrZKXX59YcdWWhmbnoTnfXpwrg0znk38x0DMJIVes8,3137
|
10
9
|
content_extraction/process.py,sha256=iLcmSjWhEg_DbgnftnVIfybIeLCuTEI57gasot0MtDk,1809
|
11
10
|
content_extraction/process_document.sh,sha256=QbQOrV7isiEyxin1PBNGYmCbfVQ_eW-JgsbuQV4VB2o,1106
|
12
|
-
content_extraction/semantic_chunk_html.py,sha256=
|
11
|
+
content_extraction/semantic_chunk_html.py,sha256=PpK2W2Fse5-SU7hBqE-JWEW_sT3cEPaKNRRD2orEs-k,5696
|
13
12
|
content_extraction/split_and_create_digest.py,sha256=vW4lyeTlRzZcqJS15g8Xqq5IZB06unrUBnQV7RrFDmA,4342
|
14
|
-
content_extraction-0.4.
|
15
|
-
content_extraction-0.4.
|
16
|
-
content_extraction-0.4.
|
17
|
-
content_extraction-0.4.
|
13
|
+
content_extraction-0.4.2.dist-info/METADATA,sha256=glN_ZSgjFvvwhIp0X49yKSxJ1Av1EqjTZONDu_FMHOo,6266
|
14
|
+
content_extraction-0.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
15
|
+
content_extraction-0.4.2.dist-info/top_level.txt,sha256=a0I0EwSzsyd3p_aAENozn9i4I3aBn12XtrbqIvfzZec,19
|
16
|
+
content_extraction-0.4.2.dist-info/RECORD,,
|
@@ -1,24 +0,0 @@
|
|
1
|
-
import dspy
|
2
|
-
|
3
|
-
lm = dspy.LM("openai/gpt-4o-mini", temperature=0.3, max_tokens=5000)
|
4
|
-
dspy.configure(lm=lm)
|
5
|
-
|
6
|
-
|
7
|
-
class CorrectHeadingLevelSignature(dspy.Signature):
|
8
|
-
"""Correct heading levels. Main title should be H1, Chapter Titles H2, etc."""
|
9
|
-
|
10
|
-
headings: str = dspy.InputField(
|
11
|
-
description=r"String of headings extracted via OCR process, separated by \n"
|
12
|
-
)
|
13
|
-
corrected_headings: str = dspy.OutputField(
|
14
|
-
description="Headings with corrected level"
|
15
|
-
)
|
16
|
-
|
17
|
-
|
18
|
-
class CorrectHeadingLevel(dspy.Module):
|
19
|
-
def __init__(self):
|
20
|
-
self.predictor = dspy.ChainOfThought(CorrectHeadingLevelSignature)
|
21
|
-
|
22
|
-
def forward(self, headings):
|
23
|
-
prediction = self.predictor(headings=headings)
|
24
|
-
return prediction
|
File without changes
|
File without changes
|