natural-pdf 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +33 -1
- natural_pdf/analyzers/layout/layout_analyzer.py +133 -44
- natural_pdf/analyzers/layout/layout_manager.py +9 -6
- natural_pdf/analyzers/layout/layout_options.py +2 -4
- natural_pdf/analyzers/layout/surya.py +199 -91
- natural_pdf/collections/pdf_collection.py +259 -0
- natural_pdf/core/page.py +97 -69
- natural_pdf/core/pdf.py +382 -171
- natural_pdf/elements/region.py +55 -26
- natural_pdf/exporters/__init__.py +1 -0
- natural_pdf/exporters/searchable_pdf.py +252 -0
- natural_pdf/search/__init__.py +94 -0
- natural_pdf/search/haystack_search_service.py +520 -0
- natural_pdf/search/haystack_utils.py +386 -0
- natural_pdf/search/search_options.py +72 -0
- natural_pdf/search/search_service_protocol.py +189 -0
- natural_pdf/search/searchable_mixin.py +464 -0
- natural_pdf-0.1.3.dist-info/METADATA +137 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/RECORD +22 -13
- natural_pdf-0.1.1.dist-info/METADATA +0 -295
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -18,7 +18,7 @@ class Region(DirectionalMixin):
|
|
18
18
|
Represents a rectangular region on a page.
|
19
19
|
"""
|
20
20
|
|
21
|
-
def __init__(self, page: 'Page', bbox: Tuple[float, float, float, float], polygon: List[Tuple[float, float]] = None, parent=None):
|
21
|
+
def __init__(self, page: 'Page', bbox: Tuple[float, float, float, float], polygon: List[Tuple[float, float]] = None, parent=None, label: Optional[str] = None):
|
22
22
|
"""
|
23
23
|
Initialize a region.
|
24
24
|
|
@@ -27,6 +27,7 @@ class Region(DirectionalMixin):
|
|
27
27
|
bbox: Bounding box as (x0, top, x1, bottom)
|
28
28
|
polygon: Optional list of coordinate points [(x1,y1), (x2,y2), ...] for non-rectangular regions
|
29
29
|
parent: Optional parent region (for hierarchical document structure)
|
30
|
+
label: Optional label for the region (e.g., for exclusions)
|
30
31
|
"""
|
31
32
|
self._page = page
|
32
33
|
self._bbox = bbox
|
@@ -49,6 +50,7 @@ class Region(DirectionalMixin):
|
|
49
50
|
# Region management attributes
|
50
51
|
self.name = None
|
51
52
|
self.source = None # Will be set by creation methods
|
53
|
+
self.label = label
|
52
54
|
|
53
55
|
# Hierarchy support for nested document structure
|
54
56
|
self.parent_region = parent
|
@@ -1514,48 +1516,75 @@ class Region(DirectionalMixin):
|
|
1514
1516
|
|
1515
1517
|
def create_cells(self):
    """
    Build table-cell regions from the rows and columns overlapping this table.

    Row regions (``region[type=table-row]``) and column regions
    (``region[type=table-column]``) are looked up on the page, restricted
    to those whose bounding box overlaps this region, and every row is
    intersected with every column. Each intersection with positive width
    and height becomes a new region that is registered with the page's
    element manager.

    Returns:
        This region, for method chaining. It is also returned (after a
        warning is logged) when no overlapping rows or columns are found.

    Raises:
        ValueError: If this region's type is neither 'table' nor
            'tableofcontents'.
    """
    # Only table-like regions may be split into cells.
    if self.region_type not in ('table', 'tableofcontents'):
        raise ValueError(f"create_cells should be called on a 'table' or 'tableofcontents' region, not '{self.region_type}'")

    def overlaps_table(candidate):
        # Strict bounding-box overlap: regions that merely touch an edge
        # of the table are not considered part of it.
        if not (candidate.x0 < self.x1 and candidate.x1 > self.x0):
            return False
        return candidate.top < self.bottom and candidate.bottom > self.top

    page = self.page
    row_regions = page.find_all('region[type=table-row]')
    column_regions = page.find_all('region[type=table-column]')

    table_rows = list(filter(overlaps_table, row_regions))
    table_columns = list(filter(overlaps_table, column_regions))

    if not (table_rows and table_columns):
        self._page.logger.warning(f"Region {self.bbox}: Cannot create cells. No overlapping row or column regions found.")
        return self

    # Walk rows top-to-bottom and columns left-to-right.
    table_rows = sorted(table_rows, key=lambda region: region.top)
    table_columns = sorted(table_columns, key=lambda region: region.x0)

    created_count = 0
    for row in table_rows:
        for column in table_columns:
            left, right = max(row.x0, column.x0), min(row.x1, column.x1)
            upper, lower = max(row.top, column.top), min(row.bottom, column.bottom)

            # Skip row/column pairs that do not actually intersect.
            if not (right > left and lower > upper):
                continue

            cell = page.create_region(left, upper, right, lower)
            cell.parent_region = self  # link the cell back to its table
            cell.model = self.model  # cells inherit the table's model
            cell.source = 'derived'
            cell.region_type = 'table-cell'
            cell.normalized_type = 'table-cell'

            page._element_mgr.add_region(cell)
            created_count += 1

    self._page.logger.info(f"Region {self.bbox} (Model: {self.model}): Created and added {created_count} cell regions.")
    return self
|
1559
1588
|
|
1560
1589
|
def ask(self, question: str, min_confidence: float = 0.1, model: str = None, debug: bool = False, **kwargs) -> Dict[str, Any]:
|
1561
1590
|
"""
|
@@ -0,0 +1 @@
|
|
1
|
+
|
@@ -0,0 +1,252 @@
|
|
1
|
+
"""
|
2
|
+
Module for exporting PDF content to various formats.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import logging
|
6
|
+
import os
|
7
|
+
import tempfile
|
8
|
+
from typing import TYPE_CHECKING, List, Dict, Any, Tuple
|
9
|
+
|
10
|
+
# Lazy imports for optional dependencies
|
11
|
+
try:
|
12
|
+
from PIL import Image
|
13
|
+
except ImportError:
|
14
|
+
Image = None # type: ignore
|
15
|
+
|
16
|
+
try:
|
17
|
+
import pikepdf
|
18
|
+
except ImportError:
|
19
|
+
pikepdf = None # type: ignore
|
20
|
+
|
21
|
+
try:
|
22
|
+
from ocrmypdf.hocrtransform import HocrTransform
|
23
|
+
except ImportError:
|
24
|
+
HocrTransform = None # type: ignore
|
25
|
+
|
26
|
+
if TYPE_CHECKING:
|
27
|
+
from natural_pdf.core.pdf import PDF
|
28
|
+
from natural_pdf.core.page import Page
|
29
|
+
|
30
|
+
|
31
|
+
logger = logging.getLogger(__name__)
|
32
|
+
|
33
|
+
# --- Constants ---
|
34
|
+
HOCR_TEMPLATE_HEADER = '''<?xml version="1.0" encoding="UTF-8"?>
|
35
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
36
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
37
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
38
|
+
<head>
|
39
|
+
<title></title>
|
40
|
+
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
|
41
|
+
<meta name='ocr-system' content='natural-pdf' />
|
42
|
+
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
|
43
|
+
</head>
|
44
|
+
<body>
|
45
|
+
'''
|
46
|
+
|
47
|
+
HOCR_TEMPLATE_PAGE = ''' <div class='ocr_page' id='page_{page_num}' title='image "{image_path}"; bbox 0 0 {width} {height}; ppageno {page_num}'>
|
48
|
+
'''
|
49
|
+
|
50
|
+
HOCR_TEMPLATE_WORD = ''' <span class='ocrx_word' id='word_{page_num}_{word_id}' title='bbox {x0} {y0} {x1} {y1}; x_wconf {confidence}'>{text}</span>
|
51
|
+
'''
|
52
|
+
|
53
|
+
HOCR_TEMPLATE_LINE_START = ''' <span class='ocr_line' id='line_{page_num}_{line_id}' title='bbox {x0} {y0} {x1} {y1}'>
|
54
|
+
'''
|
55
|
+
HOCR_TEMPLATE_LINE_END = ''' </span>
|
56
|
+
'''
|
57
|
+
|
58
|
+
HOCR_TEMPLATE_FOOTER = ''' </div>
|
59
|
+
</body>
|
60
|
+
</html>
|
61
|
+
'''
|
62
|
+
# --- End Constants ---
|
63
|
+
|
64
|
+
|
65
|
+
def _generate_hocr_for_page(page: 'Page', image_width: int, image_height: int) -> str:
    """
    Generate an hOCR document for one page from its OCR text elements.

    Words are written directly under the page element; they are not yet
    grouped into ``ocr_line`` spans.

    Args:
        page: The Page whose elements matching ``text[source=ocr]`` are exported.
        image_width: Width in pixels of the rendered page image; word
            coordinates are scaled to it and clamped to ``[0, image_width]``.
        image_height: Height in pixels of the rendered page image; word
            coordinates are scaled to it and clamped to ``[0, image_height]``.

    Returns:
        The hOCR XML string. When the page has no OCR elements a warning is
        logged and a document containing only an empty page element is
        returned.
    """
    # Function-scope import so the block does not depend on a file-level import.
    from xml.sax.saxutils import escape

    def to_pixel(value: float, scale: float, limit: int) -> int:
        # Scale a page coordinate to image pixels and keep it inside the image.
        return min(limit, max(0, int(value * scale)))

    ocr_elements = page.find_all('text[source=ocr]').elements

    # Header and page element are emitted whether or not the page has words.
    parts = [
        HOCR_TEMPLATE_HEADER,
        # image_path is left empty; the image is supplied separately by the caller.
        HOCR_TEMPLATE_PAGE.format(page_num=page.index, image_path="", width=image_width, height=image_height),
    ]

    if not ocr_elements:
        logger.warning(f"Page {page.number} has no OCR elements (text[source=ocr]) to generate hOCR from.")
        parts.append(HOCR_TEMPLATE_FOOTER)
        return "".join(parts)

    # TODO: group words geometrically into lines (ocr_line) before emitting them.

    # Scale factors from page units to image pixels.
    # Assumes OCR element coordinates share the units of page.width/height.
    scale_x = image_width / page.width if page.width > 0 else 1
    scale_y = image_height / page.height if page.height > 0 else 1

    for word_id, word in enumerate(ocr_elements):
        # NOTE(review): elements elsewhere in this package expose `top`/`bottom`
        # (distance from the top of the page, which is what hOCR expects).
        # `y0`/`y1` is kept as a fallback for elements lacking them - confirm
        # TextElement's attribute set.
        top = getattr(word, 'top', None)
        bottom = getattr(word, 'bottom', None)
        if top is None or bottom is None:
            top, bottom = word.y0, word.y1

        # Escape XML metacharacters so the output stays well-formed; a missing
        # text value is written as an empty word.
        text = escape(word.text or "")

        # Element confidence is treated as a 0-1 fraction; hOCR's x_wconf is 0-100.
        # A missing or None confidence defaults to 0.99.
        raw_confidence = getattr(word, 'confidence', None)
        if raw_confidence is None:
            raw_confidence = 0.99
        confidence = max(0, min(100, int(raw_confidence * 100)))

        parts.append(HOCR_TEMPLATE_WORD.format(
            page_num=page.index,
            word_id=word_id,
            x0=to_pixel(word.x0, scale_x, image_width),
            y0=to_pixel(top, scale_y, image_height),
            x1=to_pixel(word.x1, scale_x, image_width),
            y1=to_pixel(bottom, scale_y, image_height),
            confidence=confidence,
            text=text,
        ))
        parts.append("\n")  # blank line between words for readability

    parts.append(HOCR_TEMPLATE_FOOTER)
    return "".join(parts)
|
158
|
+
|
159
|
+
|
160
|
+
def create_searchable_pdf(pdf_object: 'PDF', output_path: str, dpi: int = 300):
    """
    Create a searchable PDF from a natural_pdf.PDF object using its OCR results.

    Each page is rendered to a PNG, an hOCR file is generated from the page's
    OCR elements, ocrmypdf's HocrTransform turns the pair into a single-page
    PDF, and pikepdf merges the per-page PDFs into the final output. All
    intermediate files live in a temporary directory.

    A page that fails to process is logged and skipped, so the output may
    contain fewer pages than the input.

    Args:
        pdf_object: The natural_pdf.PDF instance (OCR should have been run).
        output_path: The path to save the resulting searchable PDF.
        dpi: The resolution (dots per inch) for rendering page images and hOCR.

    Raises:
        ImportError: If Pillow, pikepdf or ocrmypdf could not be imported.
        RuntimeError: If no page could be processed, or if merging/saving the
            final PDF fails.
    """
    # Function-scope import so the block does not depend on a file-level import.
    from contextlib import ExitStack

    # Safeguard: the optional imports at module level fall back to None.
    if Image is None or pikepdf is None or HocrTransform is None:
        raise ImportError(
            "Required dependencies (Pillow, pikepdf, ocrmypdf) are missing. "
            "Please ensure natural-pdf is installed correctly with all dependencies."
        )

    logger.info(f"Starting searchable PDF creation for '{pdf_object.source_path}' -> '{output_path}' at {dpi} DPI.")

    temp_pdf_pages: List[str] = []
    output_abs_path = os.path.abspath(output_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        logger.debug(f"Using temporary directory: {tmpdir}")

        for i, page in enumerate(pdf_object.pages):
            logger.debug(f"Processing page {page.number} (index {i})...")
            page_base_name = f"page_{i}"
            img_path = os.path.join(tmpdir, f"{page_base_name}.png")  # Use PNG for potentially better quality
            hocr_path = os.path.join(tmpdir, f"{page_base_name}.hocr")
            pdf_page_path = os.path.join(tmpdir, f"{page_base_name}.pdf")

            try:
                # 1. Render page image at target DPI
                logger.debug(f" Rendering page {i} to image ({dpi} DPI)...")
                pil_image = page.to_image(resolution=dpi, include_highlights=False)
                pil_image.save(img_path, format='PNG')
                img_width, img_height = pil_image.size
                logger.debug(f" Image saved to {img_path} ({img_width}x{img_height})")

                # 2. Generate hOCR
                logger.debug(f" Generating hOCR...")
                hocr_content = _generate_hocr_for_page(page, img_width, img_height)
                with open(hocr_path, 'w', encoding='utf-8') as f:
                    f.write(hocr_content)
                logger.debug(f" hOCR saved to {hocr_path}")

                # 3. Use HocrTransform to create searchable PDF page
                logger.debug(f" Running HocrTransform...")
                hocr_transform = HocrTransform(hocr_filename=hocr_path, dpi=dpi)
                hocr_transform.to_pdf(out_filename=pdf_page_path, image_filename=img_path)
                temp_pdf_pages.append(pdf_page_path)
                logger.debug(f" Temporary PDF page saved to {pdf_page_path}")

            except Exception as e:
                # Deliberate best-effort: one bad page must not abort the export.
                logger.error(f" Failed to process page {page.number}: {e}", exc_info=True)
                logger.warning(f" Skipping page {page.number} due to error.")
                continue

        # 4. Merge temporary PDF pages (must happen while tmpdir still exists)
        if not temp_pdf_pages:
            logger.error("No pages were successfully processed. Cannot create output PDF.")
            raise RuntimeError("Failed to process any pages for searchable PDF creation.")

        logger.info(f"Merging {len(temp_pdf_pages)} processed pages into final PDF...")
        try:
            # Keep every source PDF open until the merged file has been saved:
            # pages appended from another Pdf may still be read from their
            # source when the destination is written. The ExitStack closes the
            # sources and the output Pdf afterwards, also on error.
            with ExitStack() as open_pdfs:
                output_pdf = open_pdfs.enter_context(pikepdf.Pdf.new())
                for temp_pdf_path in temp_pdf_pages:
                    src_page_pdf = open_pdfs.enter_context(pikepdf.Pdf.open(temp_pdf_path))
                    # Each temporary PDF is expected to hold exactly one page.
                    if len(src_page_pdf.pages) == 1:
                        output_pdf.pages.append(src_page_pdf.pages[0])
                    else:
                        logger.warning(f"Temporary PDF '{temp_pdf_path}' had unexpected number of pages ({len(src_page_pdf.pages)}). Skipping.")
                output_pdf.save(output_abs_path)
            logger.info(f"Successfully saved merged searchable PDF to: {output_abs_path}")
        except Exception as e:
            logger.error(f"Failed to merge temporary PDFs into '{output_abs_path}': {e}", exc_info=True)
            raise RuntimeError(f"Failed to save final PDF: {e}") from e

    logger.debug("Temporary directory cleaned up.")
|
@@ -0,0 +1,94 @@
|
|
1
|
+
"""Makes search functionality easily importable and provides factory functions."""
|
2
|
+
|
3
|
+
import logging
|
4
|
+
from typing import Optional
|
5
|
+
|
6
|
+
# --- Service Implementation Import ---
|
7
|
+
# Import the concrete implementation
|
8
|
+
from .haystack_search_service import HaystackSearchService
|
9
|
+
|
10
|
+
# --- Protocol Import ---
|
11
|
+
# Import the protocol for type hinting
|
12
|
+
from .search_service_protocol import (
|
13
|
+
SearchServiceProtocol,
|
14
|
+
IndexConfigurationError,
|
15
|
+
Indexable
|
16
|
+
)
|
17
|
+
|
18
|
+
# --- Option Imports (for convenience) ---
|
19
|
+
# Make options easily available via `from natural_pdf.search import ...`
|
20
|
+
from .search_options import (
|
21
|
+
BaseSearchOptions,
|
22
|
+
SearchOptions, # Alias for TextSearchOptions for simplicity?
|
23
|
+
TextSearchOptions,
|
24
|
+
MultiModalSearchOptions
|
25
|
+
)
|
26
|
+
# --- Utils Import ---
|
27
|
+
from .haystack_utils import HAS_HAYSTACK_EXTRAS, check_haystack_availability # Re-export flag and helper
|
28
|
+
|
29
|
+
logger = logging.getLogger(__name__)
|
30
|
+
|
31
|
+
# --- Factory Function ---
|
32
|
+
|
33
|
+
def get_search_service(
    collection_name: str,
    persist: bool = False,
    default_persist_path: Optional[str] = None,
    default_embedding_model: Optional[str] = None,
) -> SearchServiceProtocol:
    """
    Create a search service bound to a single collection.

    HaystackSearchService is the only implementation at present; the
    factory exists so other implementations can be added later. A new
    instance is built on every call (no caching).

    Args:
        collection_name: The name of the collection the service will manage.
        persist: Passed through to the service. True requests persistent
            storage (ChromaDB); False (default) requests in-memory storage.
        default_persist_path: Optional override for the persistent storage
            path; only forwarded to the service when not None.
        default_embedding_model: Optional override for the embedding model;
            only forwarded to the service when not None.

    Returns:
        An instance conforming to SearchServiceProtocol for the collection.

    Raises:
        ImportError: If constructing the service raised ImportError.
        RuntimeError: If constructing the service failed for any other reason.
    """
    logger.debug(f"Calling get_search_service factory for collection '{collection_name}' (persist={persist})...")

    # Mandatory constructor arguments first, then any overrides the caller
    # actually supplied, so the service's own defaults apply otherwise.
    optional_overrides = {
        'default_persist_path': default_persist_path,
        'default_embedding_model': default_embedding_model,
    }
    service_args = {'collection_name': collection_name, 'persist': persist}
    service_args.update(
        {key: value for key, value in optional_overrides.items() if value is not None}
    )

    # TODO: add a cache/registry keyed on the configuration if callers need
    # the same instance back for identical arguments.

    try:
        service_instance = HaystackSearchService(**service_args)
        logger.info(f"Created new HaystackSearchService instance for collection '{collection_name}'.")
        return service_instance
    except ImportError as e:
        logger.error(f"Failed to instantiate Search Service due to missing dependencies: {e}", exc_info=True)
        raise ImportError("Search Service could not be created. Ensure Haystack extras are installed: pip install natural-pdf[haystack]") from e
    except Exception as e:
        logger.error(f"Failed to instantiate Search Service: {e}", exc_info=True)
        raise RuntimeError("Could not create Search Service instance.") from e
|
88
|
+
|
89
|
+
# --- Optional: Define a default instance for extreme ease of use? ---
|
90
|
+
# try:
|
91
|
+
# default_search_service = get_search_service()
|
92
|
+
# except Exception:
|
93
|
+
# default_search_service = None
|
94
|
+
# logger.warning("Could not create default search service instance on import.")
|