rakam-systems-vectorstore 0.1.1rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. rakam_systems_vectorstore/MANIFEST.in +26 -0
  2. rakam_systems_vectorstore/README.md +1071 -0
  3. rakam_systems_vectorstore/__init__.py +93 -0
  4. rakam_systems_vectorstore/components/__init__.py +0 -0
  5. rakam_systems_vectorstore/components/chunker/__init__.py +19 -0
  6. rakam_systems_vectorstore/components/chunker/advanced_chunker.py +1019 -0
  7. rakam_systems_vectorstore/components/chunker/text_chunker.py +154 -0
  8. rakam_systems_vectorstore/components/embedding_model/__init__.py +0 -0
  9. rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py +546 -0
  10. rakam_systems_vectorstore/components/embedding_model/openai_embeddings.py +259 -0
  11. rakam_systems_vectorstore/components/loader/__init__.py +31 -0
  12. rakam_systems_vectorstore/components/loader/adaptive_loader.py +512 -0
  13. rakam_systems_vectorstore/components/loader/code_loader.py +699 -0
  14. rakam_systems_vectorstore/components/loader/doc_loader.py +812 -0
  15. rakam_systems_vectorstore/components/loader/eml_loader.py +556 -0
  16. rakam_systems_vectorstore/components/loader/html_loader.py +626 -0
  17. rakam_systems_vectorstore/components/loader/md_loader.py +622 -0
  18. rakam_systems_vectorstore/components/loader/odt_loader.py +750 -0
  19. rakam_systems_vectorstore/components/loader/pdf_loader.py +771 -0
  20. rakam_systems_vectorstore/components/loader/pdf_loader_light.py +723 -0
  21. rakam_systems_vectorstore/components/loader/tabular_loader.py +597 -0
  22. rakam_systems_vectorstore/components/vectorstore/__init__.py +0 -0
  23. rakam_systems_vectorstore/components/vectorstore/apps.py +10 -0
  24. rakam_systems_vectorstore/components/vectorstore/configurable_pg_vector_store.py +1661 -0
  25. rakam_systems_vectorstore/components/vectorstore/faiss_vector_store.py +878 -0
  26. rakam_systems_vectorstore/components/vectorstore/migrations/0001_initial.py +55 -0
  27. rakam_systems_vectorstore/components/vectorstore/migrations/__init__.py +0 -0
  28. rakam_systems_vectorstore/components/vectorstore/models.py +10 -0
  29. rakam_systems_vectorstore/components/vectorstore/pg_models.py +97 -0
  30. rakam_systems_vectorstore/components/vectorstore/pg_vector_store.py +827 -0
  31. rakam_systems_vectorstore/config.py +266 -0
  32. rakam_systems_vectorstore/core.py +8 -0
  33. rakam_systems_vectorstore/pyproject.toml +113 -0
  34. rakam_systems_vectorstore/server/README.md +290 -0
  35. rakam_systems_vectorstore/server/__init__.py +20 -0
  36. rakam_systems_vectorstore/server/mcp_server_vector.py +325 -0
  37. rakam_systems_vectorstore/setup.py +103 -0
  38. rakam_systems_vectorstore-0.1.1rc7.dist-info/METADATA +370 -0
  39. rakam_systems_vectorstore-0.1.1rc7.dist-info/RECORD +40 -0
  40. rakam_systems_vectorstore-0.1.1rc7.dist-info/WHEEL +4 -0
rakam_systems_vectorstore/components/loader/html_loader.py
@@ -0,0 +1,626 @@
+ """
+ HTML Loader for processing HTML files.
+
+ This loader handles HTML documents and provides:
+ - HTML parsing and text extraction
+ - Script/style tag removal
+ - Semantic structure preservation
+ - Meta tag extraction (title, description, etc.)
+ - Link and image reference extraction
+ - Table content extraction
+ """
+
+ from __future__ import annotations
+
+ import os
+ import re
+ import time
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Union
+
+ from rakam_systems_core.ai_utils import logging
+ from rakam_systems_core.ai_core.interfaces.loader import Loader
+ from rakam_systems_vectorstore.components.chunker import TextChunker
+ from rakam_systems_vectorstore.core import Node, NodeMetadata, VSFile
+
+ logger = logging.getLogger(__name__)
+
+
+ class HtmlLoader(Loader):
+     """
+     HTML loader for processing HTML documents.
+
+     This loader provides HTML file processing with support for:
+     - HTML parsing and clean text extraction
+     - Script/style tag removal
+     - Semantic structure preservation (headings, paragraphs, lists)
+     - Meta tag extraction
+     - Configurable text chunking
+
+     The extracted content is chunked and returned as text or Node objects.
+     """
+
+     # Default configuration
+     DEFAULT_CHUNK_SIZE = 3000
+     DEFAULT_CHUNK_OVERLAP = 200
+     DEFAULT_MIN_SENTENCES_PER_CHUNK = 5
+     DEFAULT_TOKENIZER = "character"
+
+     # Supported HTML file extensions
+     SUPPORTED_EXTENSIONS = {'.html', '.htm', '.xhtml'}
+
+     # Tags to remove entirely (content and all)
+     REMOVE_TAGS = ['script', 'style', 'noscript', 'iframe', 'svg', 'canvas']
+
+     # Tags that indicate section boundaries
+     SECTION_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+                     'section', 'article', 'header', 'footer', 'nav', 'aside']
+
+     def __init__(
+         self,
+         name: str = "html_loader",
+         config: Optional[Dict[str, Any]] = None
+     ):
+         """
+         Initialize HTML loader.
+
+         Args:
+             name: Component name
+             config: Optional configuration with keys:
+                 - chunk_size: Maximum tokens per chunk (default: 3000)
+                 - chunk_overlap: Overlap between chunks in tokens (default: 200)
+                 - min_sentences_per_chunk: Minimum sentences per chunk (default: 5)
+                 - tokenizer: Tokenizer for chunking (default: "character")
+                 - extract_metadata: Whether to extract meta tags (default: True)
+                 - preserve_links: Whether to preserve link references (default: False)
+                 - preserve_structure: Whether to preserve HTML structure hints (default: True)
+                 - encoding: File encoding (default: "utf-8")
+         """
+         super().__init__(name=name, config=config)
+
+         # Extract configuration
+         config = config or {}
+         self._chunk_size = config.get('chunk_size', self.DEFAULT_CHUNK_SIZE)
+         self._chunk_overlap = config.get(
+             'chunk_overlap', self.DEFAULT_CHUNK_OVERLAP)
+         self._min_sentences_per_chunk = config.get(
+             'min_sentences_per_chunk', self.DEFAULT_MIN_SENTENCES_PER_CHUNK)
+         self._tokenizer = config.get('tokenizer', self.DEFAULT_TOKENIZER)
+         self._extract_metadata = config.get('extract_metadata', True)
+         self._preserve_links = config.get('preserve_links', False)
+         self._preserve_structure = config.get('preserve_structure', True)
+         self._encoding = config.get('encoding', 'utf-8')
+
+         # Initialize text chunker
+         self._chunker = TextChunker(
+             chunk_size=self._chunk_size,
+             chunk_overlap=self._chunk_overlap,
+             min_sentences_per_chunk=self._min_sentences_per_chunk,
+             tokenizer=self._tokenizer
+         )
+
+         logger.info(
+             f"Initialized HtmlLoader with chunk_size={self._chunk_size}, chunk_overlap={self._chunk_overlap}")
+
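For orientation, a minimal construction sketch (illustrative, not part of the diff) showing how the config keys documented above are passed in; the module path is inferred from this file's location in the package listing, and the configuration values are invented.

from rakam_systems_vectorstore.components.loader.html_loader import HtmlLoader

# Hypothetical configuration chosen for illustration.
loader = HtmlLoader(config={
    "chunk_size": 1500,       # max tokens per chunk
    "chunk_overlap": 100,     # tokens shared between consecutive chunks
    "preserve_links": True,   # keep [text](href) references in the output
    "encoding": "utf-8",
})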
+     def run(self, source: str) -> List[str]:
+         """
+         Execute the primary operation for the component.
+
+         This method satisfies the BaseComponent abstract method requirement
+         and delegates to load_as_chunks.
+
+         Args:
+             source: Path to HTML file
+
+         Returns:
+             List of text chunks extracted from the HTML file
+         """
+         return self.load_as_chunks(source)
+
+     def load_as_text(
+         self,
+         source: Union[str, Path],
+     ) -> str:
+         """
+         Load HTML and return as a single text string.
+
+         This method extracts all text from the HTML file and returns it as a single
+         string without chunking. Useful when you need the full content.
+
+         Args:
+             source: Path to HTML file
+
+         Returns:
+             Full text content of the HTML as a single string
+
+         Raises:
+             FileNotFoundError: If source file doesn't exist
+             ValueError: If source is not an HTML file
+             Exception: If HTML processing fails
+         """
+         # Convert Path to string
+         if isinstance(source, Path):
+             source = str(source)
+
+         # Validate file exists
+         if not os.path.isfile(source):
+             raise FileNotFoundError(f"File not found: {source}")
+
+         # Validate file is an HTML file
+         if not self._is_html_file(source):
+             raise ValueError(
+                 f"File is not an HTML file: {source}. Extension: {Path(source).suffix}")
+
+         logger.info(f"Loading HTML as text: {source}")
+         start_time = time.time()
+
+         try:
+             # Read and parse HTML
+             with open(source, 'r', encoding=self._encoding, errors='replace') as f:
+                 html_content = f.read()
+
+             # Extract text from HTML
+             full_text = self._extract_text_from_html(html_content)
+
+             elapsed = time.time() - start_time
+             logger.info(
+                 f"HTML loaded as text in {elapsed:.2f}s: {len(full_text)} characters")
+
+             return full_text
+
+         except Exception as e:
+             logger.error(f"Error loading HTML as text {source}: {e}")
+             raise
+
+     def load_as_chunks(
+         self,
+         source: Union[str, Path],
+     ) -> List[str]:
+         """
+         Load HTML and return as a list of text chunks.
+
+         This method extracts text from the HTML file, processes it with the configured
+         chunker, and returns a list of text chunks.
+
+         Args:
+             source: Path to HTML file
+
+         Returns:
+             List of text chunks extracted from the HTML file
+
+         Raises:
+             FileNotFoundError: If source file doesn't exist
+             ValueError: If source is not an HTML file
+             Exception: If HTML processing fails
+         """
+         # Convert Path to string
+         if isinstance(source, Path):
+             source = str(source)
+
+         # Validate file exists
+         if not os.path.isfile(source):
+             raise FileNotFoundError(f"File not found: {source}")
+
+         # Validate file is an HTML file
+         if not self._is_html_file(source):
+             raise ValueError(
+                 f"File is not an HTML file: {source}. Extension: {Path(source).suffix}")
+
+         logger.info(f"Loading HTML file: {source}")
+         start_time = time.time()
+
+         try:
+             # Read and parse HTML
+             with open(source, 'r', encoding=self._encoding, errors='replace') as f:
+                 html_content = f.read()
+
+             # Extract text from HTML
+             full_text = self._extract_text_from_html(html_content)
+
+             # Chunk the text using TextChunker
+             text_chunks = self._chunk_text(full_text)
+
+             elapsed = time.time() - start_time
+             logger.info(
+                 f"HTML processed in {elapsed:.2f}s: {len(text_chunks)} chunks")
+
+             return text_chunks
+
+         except Exception as e:
+             logger.error(f"Error processing HTML {source}: {e}")
+             raise
+
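Continuing the construction sketch above with a hypothetical file path, the two public loading modes differ only in whether the extracted text is run through the chunker.

# Illustrative only: "docs/page.html" is an invented path.
full_text = loader.load_as_text("docs/page.html")    # one string, no chunking
chunks = loader.load_as_chunks("docs/page.html")     # list of chunked strings
print(f"{len(full_text)} characters -> {len(chunks)} chunks")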
+     def load_as_nodes(
+         self,
+         source: Union[str, Path],
+         source_id: Optional[str] = None,
+         custom_metadata: Optional[Dict[str, Any]] = None
+     ) -> List[Node]:
+         """
+         Load HTML and return as Node objects with metadata.
+
+         Args:
+             source: Path to HTML file
+             source_id: Optional source identifier (defaults to file path)
+             custom_metadata: Optional custom metadata to attach to nodes
+
+         Returns:
+             List of Node objects with text chunks and metadata
+         """
+         # Convert Path to string
+         if isinstance(source, Path):
+             source = str(source)
+
+         # Load text chunks
+         chunks = self.load_as_chunks(source)
+
+         # Determine source ID
+         if source_id is None:
+             source_id = source
+
+         # Extract HTML metadata if enabled
+         html_metadata = {}
+         if self._extract_metadata:
+             try:
+                 with open(source, 'r', encoding=self._encoding, errors='replace') as f:
+                     html_content = f.read()
+                 html_metadata = self._extract_html_metadata(html_content)
+             except Exception as e:
+                 logger.warning(f"Failed to extract HTML metadata: {e}")
+
+         # Create nodes with metadata
+         nodes = []
+         for idx, chunk in enumerate(chunks):
+             # Build custom metadata with HTML info
+             node_custom = custom_metadata.copy() if custom_metadata else {}
+             node_custom.update(html_metadata)
+             node_custom['content_type'] = 'html'
+
+             metadata = NodeMetadata(
+                 source_file_uuid=source_id,
+                 position=idx,
+                 custom=node_custom
+             )
+             node = Node(content=chunk, metadata=metadata)
+             nodes.append(node)
+
+         logger.info(f"Created {len(nodes)} nodes from HTML: {source}")
+         return nodes
+
+     def load_as_vsfile(
+         self,
+         file_path: Union[str, Path],
+         custom_metadata: Optional[Dict[str, Any]] = None
+     ) -> VSFile:
+         """
+         Load HTML and return as VSFile object.
+
+         Args:
+             file_path: Path to HTML file
+             custom_metadata: Optional custom metadata
+
+         Returns:
+             VSFile object with nodes
+
+         Raises:
+             FileNotFoundError: If file doesn't exist
+             ValueError: If file is not an HTML file
+         """
+         if isinstance(file_path, Path):
+             file_path = str(file_path)
+
+         if not os.path.isfile(file_path):
+             raise FileNotFoundError(f"File not found: {file_path}")
+
+         if not self._is_html_file(file_path):
+             raise ValueError(f"File is not an HTML file: {file_path}")
+
+         # Create VSFile
+         vsfile = VSFile(file_path)
+
+         # Load and create nodes
+         nodes = self.load_as_nodes(
+             file_path, str(vsfile.uuid), custom_metadata)
+         vsfile.nodes = nodes
+         vsfile.processed = True
+
+         logger.info(
+             f"Created VSFile with {len(nodes)} nodes from: {file_path}")
+         return vsfile
+
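As a further sketch (the metadata values are invented, and the Node/NodeMetadata attribute names are assumed from the constructor arguments used above), node-level loading merges caller-supplied metadata with the extracted meta tags.

# Each Node carries one chunk plus merged metadata.
nodes = loader.load_as_nodes("docs/page.html", custom_metadata={"project": "demo"})
print(nodes[0].metadata.position, nodes[0].metadata.custom.get("content_type"))

# load_as_vsfile wraps the same nodes in a VSFile keyed by its UUID.
vsfile = loader.load_as_vsfile("docs/page.html")
print(vsfile.processed, len(vsfile.nodes))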
+     def _is_html_file(self, file_path: str) -> bool:
+         """
+         Check if a file is an HTML file based on its extension.
+
+         Args:
+             file_path: Path to file
+
+         Returns:
+             True if the file is an HTML file, False otherwise
+         """
+         path = Path(file_path)
+         return path.suffix.lower() in self.SUPPORTED_EXTENSIONS
+
+     def _extract_text_from_html(self, html_content: str) -> str:
+         """
+         Extract text from HTML content.
+
+         Args:
+             html_content: Raw HTML content
+
+         Returns:
+             Extracted text content
+         """
+         try:
+             from bs4 import BeautifulSoup
+         except ImportError:
+             logger.error(
+                 "beautifulsoup4 is required for HTML support. Install with: pip install beautifulsoup4")
+             raise ImportError("beautifulsoup4 is required for HTML support")
+
+         try:
+             soup = BeautifulSoup(html_content, 'html.parser')
+
+             # Remove unwanted tags
+             for tag in self.REMOVE_TAGS:
+                 for element in soup.find_all(tag):
+                     element.decompose()
+
+             # Extract text with structure preservation if enabled
+             if self._preserve_structure:
+                 text = self._extract_with_structure(soup)
+             else:
+                 text = self._extract_plain_text(soup)
+
+             return text
+
+         except Exception as e:
+             logger.error(f"Failed to extract text from HTML: {e}")
+             raise
+
+     def _extract_with_structure(self, soup) -> str:
+         """
+         Extract text while preserving semantic structure.
+
+         Args:
+             soup: BeautifulSoup object
+
+         Returns:
+             Extracted text with structure hints
+         """
+         text_parts = []
+
+         # Get title if present
+         title = soup.find('title')
+         if title and title.string:
+             text_parts.append(f"Title: {title.string.strip()}")
+             text_parts.append("")
+
+         # Process body content
+         body = soup.find('body') or soup
+
+         for element in body.descendants:
+             if element.name in self.SECTION_TAGS:
+                 # Add section header
+                 header_text = element.get_text(strip=True)
+                 if header_text:
+                     # Add visual separator for headings
+                     if element.name.startswith('h') and len(element.name) == 2:
+                         level = int(element.name[1])
+                         prefix = '#' * level
+                         text_parts.append(f"\n{prefix} {header_text}")
+                     else:
+                         text_parts.append(f"\n[{element.name.upper()}]")
+                         text_parts.append(header_text)
+
+             elif element.name == 'p':
+                 para_text = element.get_text(strip=True)
+                 if para_text:
+                     text_parts.append(para_text)
+                     text_parts.append("")
+
+             elif element.name in ['li']:
+                 li_text = element.get_text(strip=True)
+                 if li_text:
+                     text_parts.append(f"• {li_text}")
+
+             elif element.name == 'a' and self._preserve_links:
+                 link_text = element.get_text(strip=True)
+                 href = element.get('href', '')
+                 if link_text and href:
+                     text_parts.append(f"[{link_text}]({href})")
+
+             elif element.name == 'table':
+                 table_text = self._extract_table(element)
+                 if table_text:
+                     text_parts.append(table_text)
+                     text_parts.append("")
+
+         # If no structured content found, fall back to plain text
+         if not text_parts:
+             return self._extract_plain_text(soup)
+
+         # Clean and join text
+         full_text = '\n'.join(text_parts)
+
+         # Clean up excessive whitespace
+         full_text = re.sub(r'\n{3,}', '\n\n', full_text)
+         full_text = re.sub(r' {2,}', ' ', full_text)
+
+         return full_text.strip()
+
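To make the structure hints concrete, a rough sketch of what the structured extraction yields; the markup is invented and the private helper is called directly purely for illustration.

from bs4 import BeautifulSoup

html = (
    "<html><head><title>Guide</title></head><body>"
    "<h2>Install</h2><p>Run the installer.</p>"
    "<ul><li>Windows</li><li>Linux</li></ul>"
    "</body></html>"
)
print(loader._extract_with_structure(BeautifulSoup(html, "html.parser")))
# Expected shape, roughly:
#   Title: Guide
#
#   ## Install
#   Run the installer.
#
#   • Windows
#   • Linux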
+     def _extract_plain_text(self, soup) -> str:
+         """
+         Extract plain text from HTML.
+
+         Args:
+             soup: BeautifulSoup object
+
+         Returns:
+             Plain text content
+         """
+         # Get text
+         text = soup.get_text()
+
+         # Clean up whitespace
+         lines = (line.strip() for line in text.splitlines())
+         chunks = (phrase.strip()
+                   for line in lines for phrase in line.split("  "))
+         text = '\n'.join(chunk for chunk in chunks if chunk)
+
+         return text
+
+     def _extract_table(self, table_element) -> str:
+         """
+         Extract text from table element.
+
+         Args:
+             table_element: BeautifulSoup table element
+
+         Returns:
+             Formatted table text
+         """
+         rows = []
+
+         for row in table_element.find_all('tr'):
+             cells = []
+             for cell in row.find_all(['th', 'td']):
+                 cell_text = cell.get_text(strip=True)
+                 cells.append(cell_text)
+
+             if cells:
+                 rows.append(' | '.join(cells))
+
+         return '\n'.join(rows)
+
+     def _extract_html_metadata(self, html_content: str) -> Dict[str, Any]:
+         """
+         Extract metadata from HTML (title, description, etc.).
+
+         Args:
+             html_content: Raw HTML content
+
+         Returns:
+             Dictionary of metadata
+         """
+         try:
+             from bs4 import BeautifulSoup
+         except ImportError:
+             return {}
+
+         try:
+             soup = BeautifulSoup(html_content, 'html.parser')
+             metadata = {}
+
+             # Extract title
+             title = soup.find('title')
+             if title and title.string:
+                 metadata['title'] = title.string.strip()
+
+             # Extract meta tags
+             for meta in soup.find_all('meta'):
+                 name = meta.get('name', '').lower()
+                 content = meta.get('content', '')
+
+                 if name == 'description' and content:
+                     metadata['description'] = content
+                 elif name == 'keywords' and content:
+                     metadata['keywords'] = content
+                 elif name == 'author' and content:
+                     metadata['author'] = content
+
+             # Extract Open Graph metadata
+             for meta in soup.find_all('meta', property=True):
+                 prop = meta.get('property', '').lower()
+                 content = meta.get('content', '')
+
+                 if prop.startswith('og:') and content:
+                     key = prop.replace('og:', 'og_')
+                     metadata[key] = content
+
+             return metadata
+
+         except Exception as e:
+             logger.warning(f"Failed to extract HTML metadata: {e}")
+             return {}
+
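A quick illustration (sample markup and values invented) of the dictionary this helper builds from the <title> tag, name= meta tags, and og: properties:

head = (
    "<head><title>Release Notes</title>"
    '<meta name="description" content="What changed in 0.1.1">'
    '<meta property="og:type" content="article"></head>'
)
print(loader._extract_html_metadata(head))
# -> {'title': 'Release Notes', 'description': 'What changed in 0.1.1', 'og_type': 'article'}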
+     def _chunk_text(self, text: str) -> List[str]:
+         """
+         Chunk text using TextChunker.
+
+         Args:
+             text: Full text to chunk
+
+         Returns:
+             List of text chunks
+         """
+         if not text or not text.strip():
+             return []
+
+         try:
+             # Use TextChunker's chunk_text method
+             chunk_dicts = self._chunker.chunk_text(text, context="html")
+
+             # Extract just the text from the chunk dictionaries
+             text_chunks = [chunk_dict['text'] for chunk_dict in chunk_dicts]
+
+             logger.debug(f"Chunked HTML text into {len(text_chunks)} chunks")
+             return text_chunks
+
+         except Exception as e:
+             logger.warning(f"Failed to chunk text with TextChunker: {e}")
+             # Fall back to returning the whole text as a single chunk
+             logger.info("Falling back to single chunk")
+             return [text]
+
+
+ def create_html_loader(
+     chunk_size: int = 3000,
+     chunk_overlap: int = 200,
+     min_sentences_per_chunk: int = 5,
+     tokenizer: str = "character",
+     extract_metadata: bool = True,
+     preserve_links: bool = False,
+     preserve_structure: bool = True,
+     encoding: str = 'utf-8'
+ ) -> HtmlLoader:
+     """
+     Factory function to create an HTML loader.
+
+     Args:
+         chunk_size: Maximum tokens per chunk (default: 3000)
+         chunk_overlap: Overlap between chunks in tokens (default: 200)
+         min_sentences_per_chunk: Minimum sentences per chunk (default: 5)
+         tokenizer: Tokenizer for chunking - "character", "gpt2", or HuggingFace model (default: "character")
+         extract_metadata: Whether to extract meta tags (default: True)
+         preserve_links: Whether to preserve link references in output (default: False)
+         preserve_structure: Whether to preserve HTML structure hints (default: True)
+         encoding: File encoding (default: "utf-8")
+
+     Returns:
+         Configured HTML loader
+
+     Example:
+         >>> loader = create_html_loader(chunk_size=1024, chunk_overlap=64)
+         >>> chunks = loader.run("page.html")
+         >>> print(f"Extracted {len(chunks)} chunks")
+
+         >>> # Create loader with link preservation
+         >>> loader = create_html_loader(preserve_links=True)
+         >>> chunks = loader.run("page.html")
+     """
+     config = {
+         'chunk_size': chunk_size,
+         'chunk_overlap': chunk_overlap,
+         'min_sentences_per_chunk': min_sentences_per_chunk,
+         'tokenizer': tokenizer,
+         'extract_metadata': extract_metadata,
+         'preserve_links': preserve_links,
+         'preserve_structure': preserve_structure,
+         'encoding': encoding
+     }
+
+     return HtmlLoader(config=config)
+
+
+ __all__ = ["HtmlLoader", "create_html_loader"]
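Finally, a hedged end-to-end sketch (directory name and layout invented) that batches a folder of exported pages through the factory function and collects nodes; only names defined in this module are used.

from pathlib import Path

from rakam_systems_vectorstore.components.loader.html_loader import (
    HtmlLoader,
    create_html_loader,
)

batch_loader = create_html_loader(chunk_size=1024, preserve_links=True)

all_nodes = []
for path in Path("site_export").rglob("*"):
    # SUPPORTED_EXTENSIONS covers .html, .htm and .xhtml
    if path.suffix.lower() in HtmlLoader.SUPPORTED_EXTENSIONS:
        all_nodes.extend(batch_loader.load_as_nodes(path))

print(f"Collected {len(all_nodes)} nodes from HTML export")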