natural-pdf 0.2.19__py3-none-any.whl → 0.2.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -943,7 +943,7 @@ class GuidesList(UserList):
943
943
 
944
944
  def from_headers(
945
945
  self,
946
- headers: Union["ElementCollection", List["Element"]],
946
+ headers: Union["ElementCollection", List["Element"], List[str]],
947
947
  obj: Optional[Union["Page", "Region"]] = None,
948
948
  method: Literal["min_crossings", "seam_carving"] = "min_crossings",
949
949
  min_width: Optional[float] = None,
@@ -960,7 +960,10 @@ class GuidesList(UserList):
960
960
  between headers that minimize text crossings, regardless of text alignment.
961
961
 
962
962
  Args:
963
- headers: Column header elements (ElementCollection or list of Elements)
963
+ headers: Column header elements. Can be:
964
+ - ElementCollection: collection of header elements
965
+ - List[Element]: list of header elements
966
+ - List[str]: list of header text to search for
964
967
  obj: Page/Region to analyze (uses parent's context if None)
965
968
  method: Detection method:
966
969
  - 'min_crossings': Fast vector-based minimum intersection count
@@ -980,6 +983,9 @@ class GuidesList(UserList):
980
983
  headers = page.find_all('text[size=16]')
981
984
  guides.vertical.from_headers(headers)
982
985
 
986
+ # From header text strings
987
+ guides.vertical.from_headers(["Statute", "Description", "Level", "Repeat"])
988
+
983
989
  # With width constraints
984
990
  guides.vertical.from_headers(headers, min_width=50, max_width=200)
985
991
 
@@ -997,6 +1003,24 @@ class GuidesList(UserList):
997
1003
  # Convert headers to list if ElementCollection
998
1004
  if hasattr(headers, "elements"):
999
1005
  header_elements = list(headers.elements)
1006
+ # Check if headers is a list of strings
1007
+ elif isinstance(headers, list) and headers and isinstance(headers[0], str):
1008
+ # Find elements for each header text with exact matching
1009
+ header_elements = []
1010
+ for header_text in headers:
1011
+ # Find all text elements and filter for exact match
1012
+ all_text = target_obj.find_all("text")
1013
+ exact_matches = [elem for elem in all_text if elem.extract_text() == header_text]
1014
+
1015
+ if exact_matches:
1016
+ # Use the first exact match
1017
+ header_elements.append(exact_matches[0])
1018
+ else:
1019
+ logger.warning(f"Could not find header text: {header_text}")
1020
+
1021
+ if not header_elements:
1022
+ logger.warning("No header elements found from provided text strings")
1023
+ return self._parent
1000
1024
  else:
1001
1025
  header_elements = list(headers)
1002
1026
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.19
3
+ Version: 0.2.20
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -3,7 +3,7 @@ natural_pdf/cli.py,sha256=0zO9ZoRiP8JmyGBaVavrMATnvbARWTl7WD2PEefu9BM,4061
3
3
  natural_pdf/judge.py,sha256=mRPJfdIkkL_Y6uQXnb3Wtrna04XlhPrDvxPrDiVevH4,58838
4
4
  natural_pdf/text_mixin.py,sha256=eFCiHj6Okcw3aum4955BepcI2NPRalkf9UFFVTc_H30,4012
5
5
  natural_pdf/analyzers/__init__.py,sha256=3XGoNq3OgiVkZP7tOdeP5XVUl7fDgyztdA8DlOcMLXg,1138
6
- natural_pdf/analyzers/guides.py,sha256=BqFgt-bRSOkEoFCvNsYyY8j__00X-8DJ_TLb2Hx9qsQ,202430
6
+ natural_pdf/analyzers/guides.py,sha256=SCliIk36Bu-7rMwp0K_XO0MtTVokXvxre4FNysYowiE,203626
7
7
  natural_pdf/analyzers/shape_detection_mixin.py,sha256=mgpyJ4jIulz9l9HCqThabJIsLSrXh9BB2AmLxUoHmw0,62584
8
8
  natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
9
9
  natural_pdf/analyzers/text_structure.py,sha256=3WWusi-BI0krUnJxB05DD6XmKj5qRNvQBqH7zOQGm1M,28451
@@ -119,7 +119,7 @@ natural_pdf/vision/similarity.py,sha256=HWmXDBNLSOlRWH-_1K3FVR7tSsRuMFqXZwrVhhg2
119
119
  natural_pdf/vision/template_matching.py,sha256=91XQt5tp-vmcMX_4b2Bz-YwIAlb-hc8E5ih_qAHQuCk,7145
120
120
  natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
121
121
  natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
122
- natural_pdf-0.2.19.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
122
+ natural_pdf-0.2.20.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
123
123
  optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
124
124
  optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
125
125
  optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
@@ -172,8 +172,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
172
172
  tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
173
173
  tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
174
174
  tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
175
- natural_pdf-0.2.19.dist-info/METADATA,sha256=vtMsWwMW9cR2LdQhdDFhDG4WWIkctrT7_3P7klvyJ-8,6960
176
- natural_pdf-0.2.19.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
177
- natural_pdf-0.2.19.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
178
- natural_pdf-0.2.19.dist-info/top_level.txt,sha256=ZDKhxE_tg508o9BpagsjCGcI8GY4cF_8bg0e0IaLsPI,41
179
- natural_pdf-0.2.19.dist-info/RECORD,,
175
+ natural_pdf-0.2.20.dist-info/METADATA,sha256=zryX6u3cPk4iuwz_leohWyd-i0SdvM_hr-h1APjffWQ,6960
176
+ natural_pdf-0.2.20.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
177
+ natural_pdf-0.2.20.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
178
+ natural_pdf-0.2.20.dist-info/top_level.txt,sha256=ZDKhxE_tg508o9BpagsjCGcI8GY4cF_8bg0e0IaLsPI,41
179
+ natural_pdf-0.2.20.dist-info/RECORD,,