natural-pdf 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. natural_pdf/__init__.py +29 -40
  2. natural_pdf/analyzers/text_options.py +9 -1
  3. natural_pdf/analyzers/text_structure.py +371 -58
  4. natural_pdf/classification/manager.py +1 -1
  5. natural_pdf/core/element_manager.py +11 -1
  6. natural_pdf/core/highlighting_service.py +120 -40
  7. natural_pdf/core/page.py +20 -18
  8. natural_pdf/core/pdf.py +146 -13
  9. natural_pdf/elements/base.py +17 -0
  10. natural_pdf/elements/collections.py +374 -30
  11. natural_pdf/elements/region.py +45 -14
  12. natural_pdf/exporters/data/__init__.py +0 -0
  13. natural_pdf/exporters/data/pdf.ttf +0 -0
  14. natural_pdf/exporters/data/sRGB.icc +0 -0
  15. natural_pdf/exporters/hocr.py +519 -0
  16. natural_pdf/exporters/hocr_font.py +136 -0
  17. natural_pdf/exporters/original_pdf.py +127 -0
  18. natural_pdf/exporters/searchable_pdf.py +2 -12
  19. natural_pdf/ocr/engine_surya.py +1 -1
  20. natural_pdf/search/__init__.py +65 -52
  21. natural_pdf/search/lancedb_search_service.py +325 -0
  22. natural_pdf/search/numpy_search_service.py +255 -0
  23. natural_pdf/search/searchable_mixin.py +25 -71
  24. natural_pdf/widgets/viewer.py +22 -31
  25. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -50
  26. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +29 -23
  27. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
  28. natural_pdf/search/haystack_search_service.py +0 -687
  29. natural_pdf/search/haystack_utils.py +0 -474
  30. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
  31. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0
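Note: per the file list above, the Haystack-based search backend (haystack_search_service.py, haystack_utils.py) is removed in this upgrade and replaced by LanceDB- and NumPy-backed search services. A minimal probe of the new module paths, assuming only the file names visible in this list (the classes they export are not shown in this diff):

import importlib

# Module paths taken from the file list above; anything deeper is not shown in this diff.
for module_name in (
    "natural_pdf.search.lancedb_search_service",
    "natural_pdf.search.numpy_search_service",
):
    try:
        importlib.import_module(module_name)
        print(f"{module_name}: importable")
    except ImportError as exc:
        print(f"{module_name}: unavailable ({exc})")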
natural_pdf/search/haystack_utils.py (file removed)
@@ -1,474 +0,0 @@
- # natural_pdf/search/haystack_utils.py
- import logging
- import os
- import warnings
- from pathlib import Path
- from typing import Any, Dict, List, Optional, Tuple, Type, Union
-
- from PIL import Image # Ensure Image is imported unconditionally
-
- from natural_pdf.search.search_options import (
-     BaseSearchOptions,
-     MultiModalSearchOptions,
-     SearchOptions,
-     TextSearchOptions,
- )
-
- # Set up logger for this module
- logger = logging.getLogger(__name__)
-
- # Import sentence-transformers for dimension calculation
- try:
-     from sentence_transformers import SentenceTransformer
- except ImportError:
-     SentenceTransformer = None
-
- # --- Define flag BEFORE trying Haystack imports ---
- HAS_HAYSTACK_EXTRAS = False # Default to False
-
- # Conditional Haystack Imports
- try:
-     import haystack
-     from haystack import Document as HaystackDocument
-     from haystack import Pipeline
-     from haystack.components.embedders import (
-         SentenceTransformersDocumentEmbedder,
-         SentenceTransformersTextEmbedder,
-     )
-     from haystack.document_stores.types import DocumentStore, DuplicatePolicy
-
-     # --- REMOVED Chroma Imports ---
-     # from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever
-     # from haystack_integrations.document_stores.chroma import ChromaDocumentStore
-     # --- ADDED LanceDB Imports ---
-     try:
-         from lancedb_haystack import LanceDBDocumentStore, LanceDBEmbeddingRetriever
-     except ImportError:
-         LanceDBDocumentStore = None
-         LanceDBEmbeddingRetriever = None
-
-     # Removed Chroma Imports
-
-     # Keep try/except for optional Cohere
-     try:
-         from haystack.components.rankers import CohereRanker
-     except ImportError:
-         CohereRanker = None
-
-     # Removed ChromaDB embedding function import
-
-     HAS_HAYSTACK_EXTRAS = True # Set to True if imports succeed
-     logger.debug("Successfully imported Haystack components.")
-
- except ImportError as e:
-     # HAS_HAYSTACK_EXTRAS remains False
-     # Log the full error and traceback for debugging
-     logger.warning(
-         f"Failed to import Haystack components. Semantic search functionality disabled.",
-     )
-
-     # Define dummy types/classes for type hinting and basic checks when extras aren't installed
-     BaseDocumentStore = object
-     DocumentStore = object # Dummy for protocol
-     BaseEmbedder = object # Define dummy BaseEmbedder
-     BaseTextEmbedder = object
-     HaystackDocument = Dict # Represent as Dict if not available
-     Pipeline = None
-     SentenceTransformersTextEmbedder = None
-     # --- UPDATED Dummies ---
-     LanceDBEmbeddingRetriever = (
-         None # ChromaEmbeddingRetriever = None # Dummy for Embedding Retriever
-     )
-     CohereRanker = None
-     LanceDBDocumentStore = None # ChromaDocumentStore = None
-     DuplicatePolicy = None # Dummy for DuplicatePolicy
-     # --- REMOVED Dummies ---
-     # SentenceTransformerEmbeddingFunction = None # Dummy if kept
-
-
- # Helper function to check availability and raise error
- def check_haystack_availability(feature_name: str = "Search"):
-     """Raises ImportError if Haystack extras are not installed."""
-     if not HAS_HAYSTACK_EXTRAS:
-         raise ImportError(
-             f"'{feature_name}' requires Haystack extras. "
-             "Please install them using: pip install natural-pdf[haystack]"
-         )
-
-
- # ===========================
- # Default Component Creators
- # ===========================
-
-
- def create_default_document_store(
-     # --- CHANGED persist_path to uri ---
-     uri: str = "./natural_pdf_index",
-     collection_name: str = "natural_pdf_default", # LanceDB calls this table_name
-     embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2", # Make mandatory for dim calculation
- ) -> DocumentStore:
-     """Creates a default LanceDB DocumentStore."""
-     check_haystack_availability("create_default_document_store (LanceDB)")
-     logger.debug(
-         f"Creating default LanceDBDocumentStore at uri='{uri}' with table '{collection_name}'"
-     )
-     if not LanceDBDocumentStore:
-         raise RuntimeError("LanceDBDocumentStore is not available despite Haystack extras check.")
-     if not SentenceTransformer:
-         raise ImportError(
-             "sentence-transformers library is required to determine embedding dimensions."
-         )
-
-     try:
-         # Calculate embedding dimension
-         try:
-             model = SentenceTransformer(embedding_model)
-             embedding_dims = model.get_sentence_embedding_dimension()
-             if not embedding_dims:
-                 raise ValueError(
-                     f"Could not determine embedding dimension for model: {embedding_model}"
-                 )
-             logger.debug(
-                 f"Determined embedding dimension: {embedding_dims} for model '{embedding_model}'"
-             )
-         except Exception as e:
-             logger.error(
-                 f"Failed to load SentenceTransformer model '{embedding_model}' to get dimensions: {e}",
-                 exc_info=True,
-             )
-             raise RuntimeError(
-                 f"Failed to determine embedding dimension for model '{embedding_model}'."
-             ) from e
-
-         # Create LanceDBDocumentStore
-         store = LanceDBDocumentStore(
-             database=uri, # Use uri for the database path
-             table_name=collection_name,
-             embedding_dims=embedding_dims,
-             # LanceDB might require a metadata schema, but let's try without it first for simplicity.
-             # Add `metadata_schema=...` if needed based on lancedb-haystack requirements.
-         )
-         logger.info(
-             f"Initialized LanceDBDocumentStore (Table: {collection_name}, Dims: {embedding_dims}) at uri '{uri}'"
-         )
-         return store
-     except Exception as e:
-         logger.error(f"Failed to initialize LanceDBDocumentStore: {e}", exc_info=True)
-         raise RuntimeError(
-             f"Could not create LanceDBDocumentStore for table '{collection_name}' at uri '{uri}'"
-         ) from e
-
-
- def create_default_text_embedder(
-     model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
-     device: Optional[str] = None, # Add device parameter
- ) -> SentenceTransformersTextEmbedder:
-     """Creates a default SentenceTransformer text embedder."""
-     check_haystack_availability("create_default_text_embedder")
-     logger.debug(f"Creating default SentenceTransformersTextEmbedder with model '{model_name}'")
-     if not SentenceTransformersTextEmbedder:
-         raise RuntimeError("SentenceTransformersTextEmbedder not available.")
-     try:
-         # Use Haystack component which handles device logic
-         embedder = SentenceTransformersTextEmbedder(model=model_name, device=device)
-         logger.info(
-             f"Initialized SentenceTransformersTextEmbedder (Model: {model_name}, Device: {embedder.device})"
-         )
-         return embedder
-     except Exception as e:
-         logger.error(f"Failed to initialize SentenceTransformersTextEmbedder: {e}", exc_info=True)
-         raise RuntimeError(
-             f"Could not create SentenceTransformersTextEmbedder with model '{model_name}'"
-         ) from e
-
-
- def create_default_multimodal_embedder(*args, **kwargs) -> Any:
-     """Stub for creating a default multimodal embedder (Not Implemented)."""
-     logger.error("Default multimodal embedder creation is not yet implemented.")
-     raise NotImplementedError(
-         "Creating a default multimodal embedder requires a custom component or integration not yet implemented."
-         " See: https://docs.haystack.deepset.ai/docs/custom-components"
-     )
-
-
- def create_default_text_reranker(
-     api_key: Optional[str] = None, model_name: str = "rerank-english-v2.0" # Default Cohere model
- ) -> Optional[Any]: # Returns CohereRanker instance or None
-     """
-     Creates a default Cohere Reranker if available and API key provided.
-
-     Requires COHERE_API_KEY environment variable or api_key argument.
-     Requires haystack-cohere integration: pip install haystack-cohere
-     """
-     check_haystack_availability("create_default_text_reranker (optional)")
-
-     if not CohereRanker:
-         logger.debug(
-             "CohereRanker component not available (haystack-cohere likely not installed). Skipping reranker creation."
-         )
-         return None
-
-     # Check for API key (prefer argument over environment variable)
-     cohere_api_key = api_key or os.environ.get("COHERE_API_KEY")
-     if not cohere_api_key:
-         logger.warning(
-             "COHERE_API_KEY not found in arguments or environment variables. Cannot create Cohere Reranker."
-         )
-         return None
-
-     logger.debug(f"Creating CohereRanker with model '{model_name}'")
-     try:
-         # Pass API key via authenticator for better practice if supported, or directly
-         # As of haystack 2.0b5, CohereRanker takes api_key directly
-         reranker = CohereRanker(api_key=cohere_api_key, model=model_name)
-         logger.info(f"Initialized CohereRanker (Model: {model_name})")
-         return reranker
-     except Exception as e:
-         logger.error(f"Failed to initialize CohereRanker: {e}", exc_info=True)
-         # Don't raise, just return None as reranker is optional
-         return None
-
-
- # --- Default Document Embedder Creator ---
- def create_default_document_embedder(
-     model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
-     device: Optional[str] = None,
-     progress_bar: bool = True,
-     normalize_embeddings: bool = False, # Changed default based on ST documentation
- ) -> Any: # Return Any as actual type depends on availability
-     """Creates a default SentenceTransformersDocumentEmbedder instance.
-
-     Args:
-         model_name: The Sentence Transformers model name or path.
-         device: The device (e.g., 'cpu', 'cuda') to use.
-         progress_bar: Show progress bar during embedding.
-         normalize_embeddings: Normalize embeddings to unit length.
-
-     Returns:
-         A SentenceTransformersDocumentEmbedder instance or raises ImportError.
-
-     Raises:
-         ImportError: If SentenceTransformersDocumentEmbedder is not available.
-         RuntimeError: If initialization fails.
-     """
-     check_haystack_availability("SentenceTransformersDocumentEmbedder")
-     if not SentenceTransformersDocumentEmbedder:
-         raise ImportError("SentenceTransformersDocumentEmbedder is required but not available.")
-
-     # Use the provided device parameter directly.
-     # If None, Haystack component will likely pick a default (e.g., 'cpu' or 'cuda' if available)
-     resolved_device = device
-     logger.debug(
-         f"Attempting to create SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {resolved_device or 'auto'}"
-     )
-
-     try:
-         embedder = SentenceTransformersDocumentEmbedder(
-             model=model_name,
-             device=resolved_device,
-             progress_bar=progress_bar,
-             normalize_embeddings=normalize_embeddings,
-             # meta_fields_to_embed=config.get('DOC_EMBEDDER_META_FIELDS', []) # Removed reliance on config
-             # If embedding meta fields is needed, it should be passed as a parameter
-         )
-         embedder.warm_up()
-         logger.info(
-             f"Created SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {embedder.device}"
-         ) # Use embedder.device after init
-     except Exception as e:
-         logger.error(
-             f"Failed to initialize SentenceTransformersDocumentEmbedder: {e}", exc_info=True
-         )
-         raise RuntimeError(
-             f"Failed to initialize SentenceTransformersDocumentEmbedder with model '{model_name}'."
-         ) from e
-
-     return embedder
-
-
- # ===========================
- # Helper Functions (Removed _determine_query_embedding)
- # ===========================
-
-
- # ===========================
- # Central Search Logic
- # ===========================
-
-
- def _perform_haystack_search(
-     query: Union[str, Path, Image.Image],
-     document_store: Any, # Use Any for simplicity now
-     collection_name: str, # Passed for clarity, corresponds to table_name in LanceDB
-     embedder: SentenceTransformersTextEmbedder, # Explicitly expect a text embedder for queries
-     options: BaseSearchOptions,
- ) -> List[Dict[str, Any]]:
-     """Internal function to perform search using Haystack components (LanceDBEmbeddingRetriever)."""
-     if not HAS_HAYSTACK_EXTRAS:
-         check_haystack_availability("_perform_haystack_search (LanceDB)")
-         return [] # Should not be reached due to check
-
-     logger.info(
-         f"Performing Haystack search in table '{collection_name}' (using store: {type(document_store).__name__})..."
-     )
-     logger.debug(f" Query type: {type(query).__name__}")
-     logger.debug(f" Options: {options}")
-
-     # Embed Query
-     text_query: Optional[str] = None
-     query_embedding: Optional[List[float]] = None
-
-     if isinstance(query, str):
-         text_query = query
-         if not embedder:
-             logger.error(
-                 "Text query provided, but no embedder instance was passed to _perform_haystack_search."
-             )
-             return []
-         try:
-             logger.debug(f"Running embedder {type(embedder).__name__} on query text...")
-             embedding_result = embedder.run(text=text_query)
-             query_embedding = embedding_result.get("embedding")
-             if not query_embedding:
-                 logger.error(
-                     f"Embedder {type(embedder).__name__} failed to return an embedding for the query: '{text_query[:100]}...'"
-                 )
-                 return []
-             logger.debug(
-                 f"Generated query embedding (Dim: {len(query_embedding)}). Text kept for potential reranking."
-             )
-         except Exception as e:
-             logger.error(f"Failed to run text embedder on query text: {e}", exc_info=True)
-             return []
-     elif isinstance(query, Path) or isinstance(query, Image.Image):
-         logger.error(
-             f"Unsupported query type ({type(query).__name__}) for embedding in _perform_haystack_search. Requires text."
-         )
-         return []
-     else:
-         logger.error(f"Unsupported query type: {type(query).__name__}. Requires text.")
-         return []
-
-     if query_embedding is None:
-         logger.error("Could not obtain query embedding. Cannot perform search.")
-         return []
-
-     # Set up Retriever
-     if not LanceDBEmbeddingRetriever:
-         logger.error("LanceDBEmbeddingRetriever not available.")
-         return []
-
-     # Ensure retriever_top_k is set (should be by __post_init__)
-     retriever_top_k = options.retriever_top_k
-     if retriever_top_k is None:
-         logger.warning(
-             "options.retriever_top_k was None, defaulting to options.top_k for retriever."
-         )
-         retriever_top_k = options.top_k
-
-     # Instantiate the EMBEDDING retriever
-     retriever = LanceDBEmbeddingRetriever(
-         document_store=document_store,
-         filters=options.filters or {}, # Pass filters here
-         top_k=retriever_top_k,
-     )
-
-     logger.debug(
-         f"Initialized LanceDBEmbeddingRetriever (Top K: {retriever.top_k}, Filters: {retriever.filters})"
-     )
-
-     # Set up Optional Reranker
-     reranker_instance = None
-     if options.use_reranker in [True, None]:
-         logger.debug("Attempting to initialize reranker...")
-         reranker_instance = create_default_text_reranker(
-             api_key=options.reranker_api_key,
-             model_name=options.reranker_model or "rerank-english-v2.0",
-         )
-         if reranker_instance:
-             reranker_instance.top_k = options.top_k
-             logger.info(
-                 f"Using reranker: {type(reranker_instance).__name__} (Final Top K: {options.top_k})"
-             )
-         else:
-             logger.warning(
-                 "Reranker requested (use_reranker=True/None) but could not be initialized (check API key/installation). Proceeding without reranking."
-             )
-
-     # Build and Run Pipeline
-     if not Pipeline:
-         logger.error("Haystack Pipeline class not available.")
-         return []
-
-     search_pipeline = Pipeline()
-     search_pipeline.add_component("retriever", retriever)
-
-     # Define pipeline input based on EMBEDDING retriever needs
-     pipeline_input = {"retriever": {"query_embedding": query_embedding}}
-     last_component_name = "retriever"
-
-     if reranker_instance:
-         search_pipeline.add_component("reranker", reranker_instance)
-         search_pipeline.connect("retriever.documents", "reranker.documents")
-         if text_query is None:
-             logger.error(
-                 "Reranker requires text query, but it was not available (query might not have been text)."
-             )
-             logger.warning("Skipping reranker because text query is missing.")
-             reranker_instance = None
-             last_component_name = "retriever"
-         else:
-             pipeline_input["reranker"] = {
-                 "query": text_query,
-                 "top_k": options.top_k,
-             }
-             last_component_name = "reranker"
-             logger.debug("Added reranker to pipeline and configured input.")
-     else:
-         # --- Fix: last_component_name should only be 'reranker' if it was added ---
-         # if reranker_instance was initialized and added, last_component_name is 'reranker'
-         # if not, it remains 'retriever'
-         pass # No change needed here if reranker wasn't added
-
-     logger.info("Running Haystack search pipeline...")
-     try:
-         result = search_pipeline.run(pipeline_input)
-         logger.info("Haystack search pipeline finished.")
-
-     except Exception as e:
-         logger.error(f"Haystack search pipeline failed: {e}", exc_info=True)
-         return []
-
-     # Process Results
-     final_documents: List[HaystackDocument] = []
-     if last_component_name in result and result[last_component_name].get("documents"):
-         final_documents = result[last_component_name]["documents"]
-         logger.debug(
-             f"Processed results from '{last_component_name}' ({len(final_documents)} documents)."
-         )
-     else:
-         logger.warning(
-             f"Search pipeline component '{last_component_name}' returned no documents or unexpected output format. Result keys: {result.keys()}"
-         )
-         return []
-
-     # Convert Haystack Documents to the desired output format
-     output_results = []
-     for doc in final_documents:
-         doc_id = getattr(doc, "id", None)
-         doc_score = getattr(doc, "score", 0.0)
-         doc_content = getattr(doc, "content", None)
-         doc_meta = getattr(doc, "meta", {})
-
-         meta = doc_meta or {}
-         output = {
-             "pdf_path": meta.get("pdf_path", "Unknown"),
-             "page_number": meta.get("page_number", -1),
-             "score": doc_score if doc_score is not None else 0.0,
-             "content_snippet": doc_content[:200] + "..." if doc_content else "",
-             "metadata": meta,
-         }
-         output_results.append(output)
-
-     logger.info(f"Returning {len(output_results)} relevant results.")
-     return output_results
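For context on what is being dropped, the sketch below shows how the removed helpers composed into a query in natural-pdf 0.1.10 with the [haystack] extra installed. All argument values are illustrative, and TextSearchOptions is assumed to be constructible with defaults; its actual fields live in search_options.py, which is unchanged in this diff.

# Minimal sketch of the removed Haystack/LanceDB search path (0.1.10, [haystack] extra).
from natural_pdf.search.haystack_utils import (
    create_default_document_store,
    create_default_text_embedder,
    _perform_haystack_search,
)
from natural_pdf.search.search_options import TextSearchOptions

# Store and embedder defaults match the removed helpers' signatures above.
store = create_default_document_store(
    uri="./natural_pdf_index",               # LanceDB database path
    collection_name="natural_pdf_default",   # LanceDB table name
)
embedder = create_default_text_embedder()    # SentenceTransformersTextEmbedder

# Assumed to work with defaults; top_k, filters, and reranker settings are its fields.
options = TextSearchOptions()

# Text queries only: Path or PIL.Image queries are logged as errors and return [].
results = _perform_haystack_search(
    query="inspection findings",
    document_store=store,
    collection_name="natural_pdf_default",
    embedder=embedder,
    options=options,
)
for hit in results:
    print(hit["pdf_path"], hit["page_number"], hit["score"])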