natural-pdf 0.2.19__py3-none-any.whl → 0.2.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/guides.py +26 -2
- {natural_pdf-0.2.19.dist-info → natural_pdf-0.2.20.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.19.dist-info → natural_pdf-0.2.20.dist-info}/RECORD +7 -7
- {natural_pdf-0.2.19.dist-info → natural_pdf-0.2.20.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.19.dist-info → natural_pdf-0.2.20.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.19.dist-info → natural_pdf-0.2.20.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.19.dist-info → natural_pdf-0.2.20.dist-info}/top_level.txt +0 -0
natural_pdf/analyzers/guides.py
CHANGED
@@ -943,7 +943,7 @@ class GuidesList(UserList):
|
|
943
943
|
|
944
944
|
def from_headers(
|
945
945
|
self,
|
946
|
-
headers: Union["ElementCollection", List["Element"]],
|
946
|
+
headers: Union["ElementCollection", List["Element"], List[str]],
|
947
947
|
obj: Optional[Union["Page", "Region"]] = None,
|
948
948
|
method: Literal["min_crossings", "seam_carving"] = "min_crossings",
|
949
949
|
min_width: Optional[float] = None,
|
@@ -960,7 +960,10 @@ class GuidesList(UserList):
|
|
960
960
|
between headers that minimize text crossings, regardless of text alignment.
|
961
961
|
|
962
962
|
Args:
|
963
|
-
headers: Column header elements
|
963
|
+
headers: Column header elements. Can be:
|
964
|
+
- ElementCollection: collection of header elements
|
965
|
+
- List[Element]: list of header elements
|
966
|
+
- List[str]: list of header text to search for
|
964
967
|
obj: Page/Region to analyze (uses parent's context if None)
|
965
968
|
method: Detection method:
|
966
969
|
- 'min_crossings': Fast vector-based minimum intersection count
|
@@ -980,6 +983,9 @@ class GuidesList(UserList):
|
|
980
983
|
headers = page.find_all('text[size=16]')
|
981
984
|
guides.vertical.from_headers(headers)
|
982
985
|
|
986
|
+
# From header text strings
|
987
|
+
guides.vertical.from_headers(["Statute", "Description", "Level", "Repeat"])
|
988
|
+
|
983
989
|
# With width constraints
|
984
990
|
guides.vertical.from_headers(headers, min_width=50, max_width=200)
|
985
991
|
|
@@ -997,6 +1003,24 @@ class GuidesList(UserList):
|
|
997
1003
|
# Convert headers to list if ElementCollection
|
998
1004
|
if hasattr(headers, "elements"):
|
999
1005
|
header_elements = list(headers.elements)
|
1006
|
+
# Check if headers is a list of strings
|
1007
|
+
elif isinstance(headers, list) and headers and isinstance(headers[0], str):
|
1008
|
+
# Find elements for each header text with exact matching
|
1009
|
+
header_elements = []
|
1010
|
+
for header_text in headers:
|
1011
|
+
# Find all text elements and filter for exact match
|
1012
|
+
all_text = target_obj.find_all("text")
|
1013
|
+
exact_matches = [elem for elem in all_text if elem.extract_text() == header_text]
|
1014
|
+
|
1015
|
+
if exact_matches:
|
1016
|
+
# Use the first exact match
|
1017
|
+
header_elements.append(exact_matches[0])
|
1018
|
+
else:
|
1019
|
+
logger.warning(f"Could not find header text: {header_text}")
|
1020
|
+
|
1021
|
+
if not header_elements:
|
1022
|
+
logger.warning("No header elements found from provided text strings")
|
1023
|
+
return self._parent
|
1000
1024
|
else:
|
1001
1025
|
header_elements = list(headers)
|
1002
1026
|
|
@@ -3,7 +3,7 @@ natural_pdf/cli.py,sha256=0zO9ZoRiP8JmyGBaVavrMATnvbARWTl7WD2PEefu9BM,4061
|
|
3
3
|
natural_pdf/judge.py,sha256=mRPJfdIkkL_Y6uQXnb3Wtrna04XlhPrDvxPrDiVevH4,58838
|
4
4
|
natural_pdf/text_mixin.py,sha256=eFCiHj6Okcw3aum4955BepcI2NPRalkf9UFFVTc_H30,4012
|
5
5
|
natural_pdf/analyzers/__init__.py,sha256=3XGoNq3OgiVkZP7tOdeP5XVUl7fDgyztdA8DlOcMLXg,1138
|
6
|
-
natural_pdf/analyzers/guides.py,sha256=
|
6
|
+
natural_pdf/analyzers/guides.py,sha256=SCliIk36Bu-7rMwp0K_XO0MtTVokXvxre4FNysYowiE,203626
|
7
7
|
natural_pdf/analyzers/shape_detection_mixin.py,sha256=mgpyJ4jIulz9l9HCqThabJIsLSrXh9BB2AmLxUoHmw0,62584
|
8
8
|
natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
|
9
9
|
natural_pdf/analyzers/text_structure.py,sha256=3WWusi-BI0krUnJxB05DD6XmKj5qRNvQBqH7zOQGm1M,28451
|
@@ -119,7 +119,7 @@ natural_pdf/vision/similarity.py,sha256=HWmXDBNLSOlRWH-_1K3FVR7tSsRuMFqXZwrVhhg2
|
|
119
119
|
natural_pdf/vision/template_matching.py,sha256=91XQt5tp-vmcMX_4b2Bz-YwIAlb-hc8E5ih_qAHQuCk,7145
|
120
120
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
121
121
|
natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
|
122
|
-
natural_pdf-0.2.
|
122
|
+
natural_pdf-0.2.20.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
123
123
|
optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
|
124
124
|
optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
|
125
125
|
optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
|
@@ -172,8 +172,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
|
|
172
172
|
tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
|
173
173
|
tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
|
174
174
|
tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
|
175
|
-
natural_pdf-0.2.
|
176
|
-
natural_pdf-0.2.
|
177
|
-
natural_pdf-0.2.
|
178
|
-
natural_pdf-0.2.
|
179
|
-
natural_pdf-0.2.
|
175
|
+
natural_pdf-0.2.20.dist-info/METADATA,sha256=zryX6u3cPk4iuwz_leohWyd-i0SdvM_hr-h1APjffWQ,6960
|
176
|
+
natural_pdf-0.2.20.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
177
|
+
natural_pdf-0.2.20.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
178
|
+
natural_pdf-0.2.20.dist-info/top_level.txt,sha256=ZDKhxE_tg508o9BpagsjCGcI8GY4cF_8bg0e0IaLsPI,41
|
179
|
+
natural_pdf-0.2.20.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|