content-extraction 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -269,7 +269,7 @@ def process_file(input_path: str, output_dir: str, force_ext: str = '') -> str:
269
269
  parser = HTMLSectionParser()
270
270
  parsed_sections = parser.parse_sections(html_content)
271
271
  parsed_sections_output_file = output_dir_path / 'parsed_sections.json'
272
- with open(parsed_sections_output_file) as f:
272
+ with open(parsed_sections_output_file, 'w') as f:
273
273
  f.write(json.dumps(parsed_sections))
274
274
 
275
275
  logger.info('[Processing File] Splitting parsed sections and creating JSON digest.')
@@ -10,7 +10,7 @@ class HTMLSectionParser:
10
10
  """Fast parser for HTML that finds sections and splits content into subsections."""
11
11
 
12
12
  def __init__(self):
13
- self.heading_tags = {"h1", "h2", "h3", "h4", "h5", "h6"}
13
+ self.heading_tags = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
14
14
 
15
15
  def get_heading_level(self, element) -> int | None:
16
16
  """Extract heading level from an element."""
@@ -19,23 +19,21 @@ class HTMLSectionParser:
19
19
  return int(element.name[1])
20
20
 
21
21
  # Elements with role="heading" and aria-level
22
- if element.get("role") == "heading":
23
- aria_level = element.get("aria-level")
22
+ if element.get('role') == 'heading':
23
+ aria_level = element.get('aria-level')
24
24
  if aria_level and aria_level.isdigit():
25
25
  return int(aria_level)
26
26
  # Default to level 1 if no aria-level specified
27
27
  return 1
28
28
 
29
29
  # Elements with aria-level (even without role="heading")
30
- aria_level = element.get("aria-level")
30
+ aria_level = element.get('aria-level')
31
31
  if aria_level and aria_level.isdigit():
32
32
  return int(aria_level)
33
33
 
34
34
  return None
35
35
 
36
- def extract_text_between_headings(
37
- self, soup, start_element, end_element=None
38
- ) -> str:
36
+ def extract_text_between_headings(self, soup, start_element, end_element=None) -> str:
39
37
  """Extract all content between two heading elements."""
40
38
  content_parts = []
41
39
  current = start_element.next_sibling
@@ -45,8 +43,8 @@ class HTMLSectionParser:
45
43
  # Check if this is a heading element
46
44
  if (
47
45
  current.name in self.heading_tags
48
- or (current.get("role") == "heading")
49
- or current.get("aria-level", "").isdigit()
46
+ or (current.get('role') == 'heading')
47
+ or current.get('aria-level', '').isdigit()
50
48
  ):
51
49
  # Hit another heading, stop
52
50
  break
@@ -59,14 +57,12 @@ class HTMLSectionParser:
59
57
  break
60
58
 
61
59
  content_parts.append(str(current))
62
- elif (
63
- hasattr(current, "string") and current.string and current.string.strip()
64
- ):
60
+ elif hasattr(current, 'string') and current.string and current.string.strip():
65
61
  # It's text content
66
62
  content_parts.append(current.string)
67
63
  current = current.next_sibling
68
64
 
69
- return "".join(content_parts).strip()
65
+ return ''.join(content_parts).strip()
70
66
 
71
67
  def _find_headings_in_element(self, element):
72
68
  """Find all heading elements within a given element."""
@@ -77,9 +73,7 @@ class HTMLSectionParser:
77
73
  headings.append((child, level))
78
74
  return headings
79
75
 
80
- def find_next_heading_at_level_or_higher(
81
- self, soup, start_element, current_level: int
82
- ):
76
+ def find_next_heading_at_level_or_higher(self, soup, start_element, current_level: int):
83
77
  """Find the next heading at the same level or higher."""
84
78
  current = start_element.next_sibling
85
79
 
@@ -94,7 +88,7 @@ class HTMLSectionParser:
94
88
 
95
89
  def parse_sections(self, html_content: str) -> list[dict[str, object]]:
96
90
  """Parse HTML and extract hierarchical sections."""
97
- soup = BeautifulSoup(html_content, "lxml")
91
+ soup = BeautifulSoup(html_content, 'lxml')
98
92
 
99
93
  # Find all potential heading elements in document order
100
94
  headings = []
@@ -141,19 +135,17 @@ class HTMLSectionParser:
141
135
  if j < len(headings):
142
136
  next_boundary = headings[j][0]
143
137
 
144
- text_content = self.extract_text_between_headings(
145
- soup, current_element, next_boundary
146
- )
138
+ text_content = self.extract_text_between_headings(soup, current_element, next_boundary)
147
139
 
148
140
  # Build subsections recursively
149
141
  subsections = self._build_hierarchy(soup, subsection_headings)
150
142
 
151
143
  # Build the section dictionary
152
144
  section = {
153
- "title": current_element.get_text().strip(),
154
- "text": text_content,
155
- "level": current_level,
156
- "subsections": subsections,
145
+ 'title': current_element.get_text().strip(),
146
+ 'text': text_content,
147
+ 'level': current_level,
148
+ 'subsections': subsections,
157
149
  }
158
150
 
159
151
  result.append(section)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content_extraction
3
- Version: 0.4.1
3
+ Version: 0.4.2
4
4
  Summary: Project dedicated to content extraction from unstructured files that contain some useful information.
5
5
  Requires-Python: >=3.12
6
6
  Description-Content-Type: text/markdown
@@ -1,17 +1,16 @@
1
1
  content_extraction/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  content_extraction/common_std_io.py,sha256=mSRaiI4OrnttEQ8Y92-LsJnAHEI3xLKnJvmXDHmkfWc,1547
3
3
  content_extraction/do_ocr.py,sha256=lrqwPYQlPuUHabirH_RzKbzHgYUPPpNeHDe_u4h9LEY,6886
4
- content_extraction/dspy_modules.py,sha256=0aAokJQNzczfowoUNK3BPMi_U18eXM9thHvciWaE5b0,732
5
4
  content_extraction/extract_from_pptx.py,sha256=IWd81sn7ZsyaQZdXP5Cgbk7GspcDYEjMnBkti-pTHQY,6572
6
- content_extraction/file_handlers.py,sha256=ppCi2A05Qns1I89jLu6gJyV2UidcY03DGjsZ8TkGXK8,11777
5
+ content_extraction/file_handlers.py,sha256=I15c2dINQudsRY3wXsv0pNeNsXc8fm5PIZ7GkY4DfrM,11782
7
6
  content_extraction/fix_ocr.py,sha256=2xJ4c3VsGSy1l-qAukvhaV8QOp6yu5BY99Gb0DwamWQ,8009
8
7
  content_extraction/logging_config.py,sha256=GN1wuJJEspQ3z-FZIg134obsHweuiicZfz2an13a9_I,296
9
8
  content_extraction/parse_html.py,sha256=mOrZKXX59YcdWWhmbnoTnfXpwrg0znk38x0DMJIVes8,3137
10
9
  content_extraction/process.py,sha256=iLcmSjWhEg_DbgnftnVIfybIeLCuTEI57gasot0MtDk,1809
11
10
  content_extraction/process_document.sh,sha256=QbQOrV7isiEyxin1PBNGYmCbfVQ_eW-JgsbuQV4VB2o,1106
12
- content_extraction/semantic_chunk_html.py,sha256=iJPspKkrt95lL46JpC_9fgT8GfV8cz04TWEnU99rbBw,5786
11
+ content_extraction/semantic_chunk_html.py,sha256=PpK2W2Fse5-SU7hBqE-JWEW_sT3cEPaKNRRD2orEs-k,5696
13
12
  content_extraction/split_and_create_digest.py,sha256=vW4lyeTlRzZcqJS15g8Xqq5IZB06unrUBnQV7RrFDmA,4342
14
- content_extraction-0.4.1.dist-info/METADATA,sha256=8V8OnWOkS4Nie974-BPfMfWTKH-CmTYo72yUmIjdJE8,6266
15
- content_extraction-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
16
- content_extraction-0.4.1.dist-info/top_level.txt,sha256=a0I0EwSzsyd3p_aAENozn9i4I3aBn12XtrbqIvfzZec,19
17
- content_extraction-0.4.1.dist-info/RECORD,,
13
+ content_extraction-0.4.2.dist-info/METADATA,sha256=glN_ZSgjFvvwhIp0X49yKSxJ1Av1EqjTZONDu_FMHOo,6266
14
+ content_extraction-0.4.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
+ content_extraction-0.4.2.dist-info/top_level.txt,sha256=a0I0EwSzsyd3p_aAENozn9i4I3aBn12XtrbqIvfzZec,19
16
+ content_extraction-0.4.2.dist-info/RECORD,,
@@ -1,24 +0,0 @@
1
- import dspy
2
-
3
- lm = dspy.LM("openai/gpt-4o-mini", temperature=0.3, max_tokens=5000)
4
- dspy.configure(lm=lm)
5
-
6
-
7
- class CorrectHeadingLevelSignature(dspy.Signature):
8
- """Correct heading levels. Main title should be H1, Chapter Titles H2, etc."""
9
-
10
- headings: str = dspy.InputField(
11
- description=r"String of headings extracted via OCR process, separated by \n"
12
- )
13
- corrected_headings: str = dspy.OutputField(
14
- description="Headings with corrected level"
15
- )
16
-
17
-
18
- class CorrectHeadingLevel(dspy.Module):
19
- def __init__(self):
20
- self.predictor = dspy.ChainOfThought(CorrectHeadingLevelSignature)
21
-
22
- def forward(self, headings):
23
- prediction = self.predictor(headings=headings)
24
- return prediction