mseep_txtai-9.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (251)
  1. mseep_txtai-9.1.1.dist-info/METADATA +262 -0
  2. mseep_txtai-9.1.1.dist-info/RECORD +251 -0
  3. mseep_txtai-9.1.1.dist-info/WHEEL +5 -0
  4. mseep_txtai-9.1.1.dist-info/licenses/LICENSE +190 -0
  5. mseep_txtai-9.1.1.dist-info/top_level.txt +1 -0
  6. txtai/__init__.py +16 -0
  7. txtai/agent/__init__.py +12 -0
  8. txtai/agent/base.py +54 -0
  9. txtai/agent/factory.py +39 -0
  10. txtai/agent/model.py +107 -0
  11. txtai/agent/placeholder.py +16 -0
  12. txtai/agent/tool/__init__.py +7 -0
  13. txtai/agent/tool/embeddings.py +69 -0
  14. txtai/agent/tool/factory.py +130 -0
  15. txtai/agent/tool/function.py +49 -0
  16. txtai/ann/__init__.py +7 -0
  17. txtai/ann/base.py +153 -0
  18. txtai/ann/dense/__init__.py +11 -0
  19. txtai/ann/dense/annoy.py +72 -0
  20. txtai/ann/dense/factory.py +76 -0
  21. txtai/ann/dense/faiss.py +233 -0
  22. txtai/ann/dense/hnsw.py +104 -0
  23. txtai/ann/dense/numpy.py +164 -0
  24. txtai/ann/dense/pgvector.py +323 -0
  25. txtai/ann/dense/sqlite.py +303 -0
  26. txtai/ann/dense/torch.py +38 -0
  27. txtai/ann/sparse/__init__.py +7 -0
  28. txtai/ann/sparse/factory.py +61 -0
  29. txtai/ann/sparse/ivfsparse.py +377 -0
  30. txtai/ann/sparse/pgsparse.py +56 -0
  31. txtai/api/__init__.py +18 -0
  32. txtai/api/application.py +134 -0
  33. txtai/api/authorization.py +53 -0
  34. txtai/api/base.py +159 -0
  35. txtai/api/cluster.py +295 -0
  36. txtai/api/extension.py +19 -0
  37. txtai/api/factory.py +40 -0
  38. txtai/api/responses/__init__.py +7 -0
  39. txtai/api/responses/factory.py +30 -0
  40. txtai/api/responses/json.py +56 -0
  41. txtai/api/responses/messagepack.py +51 -0
  42. txtai/api/route.py +41 -0
  43. txtai/api/routers/__init__.py +25 -0
  44. txtai/api/routers/agent.py +38 -0
  45. txtai/api/routers/caption.py +42 -0
  46. txtai/api/routers/embeddings.py +280 -0
  47. txtai/api/routers/entity.py +42 -0
  48. txtai/api/routers/extractor.py +28 -0
  49. txtai/api/routers/labels.py +47 -0
  50. txtai/api/routers/llm.py +61 -0
  51. txtai/api/routers/objects.py +42 -0
  52. txtai/api/routers/openai.py +191 -0
  53. txtai/api/routers/rag.py +61 -0
  54. txtai/api/routers/reranker.py +46 -0
  55. txtai/api/routers/segmentation.py +42 -0
  56. txtai/api/routers/similarity.py +48 -0
  57. txtai/api/routers/summary.py +46 -0
  58. txtai/api/routers/tabular.py +42 -0
  59. txtai/api/routers/textractor.py +42 -0
  60. txtai/api/routers/texttospeech.py +33 -0
  61. txtai/api/routers/transcription.py +42 -0
  62. txtai/api/routers/translation.py +46 -0
  63. txtai/api/routers/upload.py +36 -0
  64. txtai/api/routers/workflow.py +28 -0
  65. txtai/app/__init__.py +5 -0
  66. txtai/app/base.py +821 -0
  67. txtai/archive/__init__.py +9 -0
  68. txtai/archive/base.py +104 -0
  69. txtai/archive/compress.py +51 -0
  70. txtai/archive/factory.py +25 -0
  71. txtai/archive/tar.py +49 -0
  72. txtai/archive/zip.py +35 -0
  73. txtai/cloud/__init__.py +8 -0
  74. txtai/cloud/base.py +106 -0
  75. txtai/cloud/factory.py +70 -0
  76. txtai/cloud/hub.py +101 -0
  77. txtai/cloud/storage.py +125 -0
  78. txtai/console/__init__.py +5 -0
  79. txtai/console/__main__.py +22 -0
  80. txtai/console/base.py +264 -0
  81. txtai/data/__init__.py +10 -0
  82. txtai/data/base.py +138 -0
  83. txtai/data/labels.py +42 -0
  84. txtai/data/questions.py +135 -0
  85. txtai/data/sequences.py +48 -0
  86. txtai/data/texts.py +68 -0
  87. txtai/data/tokens.py +28 -0
  88. txtai/database/__init__.py +14 -0
  89. txtai/database/base.py +342 -0
  90. txtai/database/client.py +227 -0
  91. txtai/database/duckdb.py +150 -0
  92. txtai/database/embedded.py +76 -0
  93. txtai/database/encoder/__init__.py +8 -0
  94. txtai/database/encoder/base.py +37 -0
  95. txtai/database/encoder/factory.py +56 -0
  96. txtai/database/encoder/image.py +43 -0
  97. txtai/database/encoder/serialize.py +28 -0
  98. txtai/database/factory.py +77 -0
  99. txtai/database/rdbms.py +569 -0
  100. txtai/database/schema/__init__.py +6 -0
  101. txtai/database/schema/orm.py +99 -0
  102. txtai/database/schema/statement.py +98 -0
  103. txtai/database/sql/__init__.py +8 -0
  104. txtai/database/sql/aggregate.py +178 -0
  105. txtai/database/sql/base.py +189 -0
  106. txtai/database/sql/expression.py +404 -0
  107. txtai/database/sql/token.py +342 -0
  108. txtai/database/sqlite.py +57 -0
  109. txtai/embeddings/__init__.py +7 -0
  110. txtai/embeddings/base.py +1107 -0
  111. txtai/embeddings/index/__init__.py +14 -0
  112. txtai/embeddings/index/action.py +15 -0
  113. txtai/embeddings/index/autoid.py +92 -0
  114. txtai/embeddings/index/configuration.py +71 -0
  115. txtai/embeddings/index/documents.py +86 -0
  116. txtai/embeddings/index/functions.py +155 -0
  117. txtai/embeddings/index/indexes.py +199 -0
  118. txtai/embeddings/index/indexids.py +60 -0
  119. txtai/embeddings/index/reducer.py +104 -0
  120. txtai/embeddings/index/stream.py +67 -0
  121. txtai/embeddings/index/transform.py +205 -0
  122. txtai/embeddings/search/__init__.py +11 -0
  123. txtai/embeddings/search/base.py +344 -0
  124. txtai/embeddings/search/errors.py +9 -0
  125. txtai/embeddings/search/explain.py +120 -0
  126. txtai/embeddings/search/ids.py +61 -0
  127. txtai/embeddings/search/query.py +69 -0
  128. txtai/embeddings/search/scan.py +196 -0
  129. txtai/embeddings/search/terms.py +46 -0
  130. txtai/graph/__init__.py +10 -0
  131. txtai/graph/base.py +769 -0
  132. txtai/graph/factory.py +61 -0
  133. txtai/graph/networkx.py +275 -0
  134. txtai/graph/query.py +181 -0
  135. txtai/graph/rdbms.py +113 -0
  136. txtai/graph/topics.py +166 -0
  137. txtai/models/__init__.py +9 -0
  138. txtai/models/models.py +268 -0
  139. txtai/models/onnx.py +133 -0
  140. txtai/models/pooling/__init__.py +9 -0
  141. txtai/models/pooling/base.py +141 -0
  142. txtai/models/pooling/cls.py +28 -0
  143. txtai/models/pooling/factory.py +144 -0
  144. txtai/models/pooling/late.py +173 -0
  145. txtai/models/pooling/mean.py +33 -0
  146. txtai/models/pooling/muvera.py +164 -0
  147. txtai/models/registry.py +37 -0
  148. txtai/models/tokendetection.py +122 -0
  149. txtai/pipeline/__init__.py +17 -0
  150. txtai/pipeline/audio/__init__.py +11 -0
  151. txtai/pipeline/audio/audiomixer.py +58 -0
  152. txtai/pipeline/audio/audiostream.py +94 -0
  153. txtai/pipeline/audio/microphone.py +244 -0
  154. txtai/pipeline/audio/signal.py +186 -0
  155. txtai/pipeline/audio/texttoaudio.py +60 -0
  156. txtai/pipeline/audio/texttospeech.py +553 -0
  157. txtai/pipeline/audio/transcription.py +212 -0
  158. txtai/pipeline/base.py +23 -0
  159. txtai/pipeline/data/__init__.py +10 -0
  160. txtai/pipeline/data/filetohtml.py +206 -0
  161. txtai/pipeline/data/htmltomd.py +414 -0
  162. txtai/pipeline/data/segmentation.py +178 -0
  163. txtai/pipeline/data/tabular.py +155 -0
  164. txtai/pipeline/data/textractor.py +139 -0
  165. txtai/pipeline/data/tokenizer.py +112 -0
  166. txtai/pipeline/factory.py +77 -0
  167. txtai/pipeline/hfmodel.py +111 -0
  168. txtai/pipeline/hfpipeline.py +96 -0
  169. txtai/pipeline/image/__init__.py +7 -0
  170. txtai/pipeline/image/caption.py +55 -0
  171. txtai/pipeline/image/imagehash.py +90 -0
  172. txtai/pipeline/image/objects.py +80 -0
  173. txtai/pipeline/llm/__init__.py +11 -0
  174. txtai/pipeline/llm/factory.py +86 -0
  175. txtai/pipeline/llm/generation.py +173 -0
  176. txtai/pipeline/llm/huggingface.py +218 -0
  177. txtai/pipeline/llm/litellm.py +90 -0
  178. txtai/pipeline/llm/llama.py +152 -0
  179. txtai/pipeline/llm/llm.py +75 -0
  180. txtai/pipeline/llm/rag.py +477 -0
  181. txtai/pipeline/nop.py +14 -0
  182. txtai/pipeline/tensors.py +52 -0
  183. txtai/pipeline/text/__init__.py +13 -0
  184. txtai/pipeline/text/crossencoder.py +70 -0
  185. txtai/pipeline/text/entity.py +140 -0
  186. txtai/pipeline/text/labels.py +137 -0
  187. txtai/pipeline/text/lateencoder.py +103 -0
  188. txtai/pipeline/text/questions.py +48 -0
  189. txtai/pipeline/text/reranker.py +57 -0
  190. txtai/pipeline/text/similarity.py +83 -0
  191. txtai/pipeline/text/summary.py +98 -0
  192. txtai/pipeline/text/translation.py +298 -0
  193. txtai/pipeline/train/__init__.py +7 -0
  194. txtai/pipeline/train/hfonnx.py +196 -0
  195. txtai/pipeline/train/hftrainer.py +398 -0
  196. txtai/pipeline/train/mlonnx.py +63 -0
  197. txtai/scoring/__init__.py +12 -0
  198. txtai/scoring/base.py +188 -0
  199. txtai/scoring/bm25.py +29 -0
  200. txtai/scoring/factory.py +95 -0
  201. txtai/scoring/pgtext.py +181 -0
  202. txtai/scoring/sif.py +32 -0
  203. txtai/scoring/sparse.py +218 -0
  204. txtai/scoring/terms.py +499 -0
  205. txtai/scoring/tfidf.py +358 -0
  206. txtai/serialize/__init__.py +10 -0
  207. txtai/serialize/base.py +85 -0
  208. txtai/serialize/errors.py +9 -0
  209. txtai/serialize/factory.py +29 -0
  210. txtai/serialize/messagepack.py +42 -0
  211. txtai/serialize/pickle.py +98 -0
  212. txtai/serialize/serializer.py +46 -0
  213. txtai/util/__init__.py +7 -0
  214. txtai/util/resolver.py +32 -0
  215. txtai/util/sparsearray.py +62 -0
  216. txtai/util/template.py +16 -0
  217. txtai/vectors/__init__.py +8 -0
  218. txtai/vectors/base.py +476 -0
  219. txtai/vectors/dense/__init__.py +12 -0
  220. txtai/vectors/dense/external.py +55 -0
  221. txtai/vectors/dense/factory.py +121 -0
  222. txtai/vectors/dense/huggingface.py +44 -0
  223. txtai/vectors/dense/litellm.py +86 -0
  224. txtai/vectors/dense/llama.py +84 -0
  225. txtai/vectors/dense/m2v.py +67 -0
  226. txtai/vectors/dense/sbert.py +92 -0
  227. txtai/vectors/dense/words.py +211 -0
  228. txtai/vectors/recovery.py +57 -0
  229. txtai/vectors/sparse/__init__.py +7 -0
  230. txtai/vectors/sparse/base.py +90 -0
  231. txtai/vectors/sparse/factory.py +55 -0
  232. txtai/vectors/sparse/sbert.py +34 -0
  233. txtai/version.py +6 -0
  234. txtai/workflow/__init__.py +8 -0
  235. txtai/workflow/base.py +184 -0
  236. txtai/workflow/execute.py +99 -0
  237. txtai/workflow/factory.py +42 -0
  238. txtai/workflow/task/__init__.py +18 -0
  239. txtai/workflow/task/base.py +490 -0
  240. txtai/workflow/task/console.py +24 -0
  241. txtai/workflow/task/export.py +64 -0
  242. txtai/workflow/task/factory.py +89 -0
  243. txtai/workflow/task/file.py +28 -0
  244. txtai/workflow/task/image.py +36 -0
  245. txtai/workflow/task/retrieve.py +61 -0
  246. txtai/workflow/task/service.py +102 -0
  247. txtai/workflow/task/storage.py +110 -0
  248. txtai/workflow/task/stream.py +33 -0
  249. txtai/workflow/task/template.py +116 -0
  250. txtai/workflow/task/url.py +20 -0
  251. txtai/workflow/task/workflow.py +14 -0
txtai/pipeline/audio/transcription.py ADDED
@@ -0,0 +1,212 @@
+ """
+ Transcription module
+ """
+
+ import numpy as np
+
+ # Conditional import
+ try:
+     import soundfile as sf
+
+     from .signal import Signal, SCIPY
+
+     TRANSCRIPTION = SCIPY
+ except (ImportError, OSError):
+     TRANSCRIPTION = False
+
+ from ..hfpipeline import HFPipeline
+
+
+ class Transcription(HFPipeline):
+     """
+     Transcribes audio files or data to text.
+     """
+
+     def __init__(self, path=None, quantize=False, gpu=True, model=None, **kwargs):
+         if not TRANSCRIPTION:
+             raise ImportError(
+                 'Transcription pipeline is not available - install "pipeline" extra to enable. Also check that libsndfile is available.'
+             )
+
+         # Call parent constructor
+         super().__init__("automatic-speech-recognition", path, quantize, gpu, model, **kwargs)
+
+     def __call__(self, audio, rate=None, chunk=10, join=True, **kwargs):
+         """
+         Transcribes audio files or data to text.
+
+         This method supports a single audio element or a list of audio. If the input is a single audio
+         element, the return type is a string. If the input is a list, a list of strings is returned.
+
+         Args:
+             audio: audio|list
+             rate: sample rate, only required with raw audio data
+             chunk: process audio in chunk second sized segments
+             join: if True (default), combine each chunk back together into a single text output.
+                   When False, chunks are returned as a list of dicts, each having the raw associated audio and
+                   sample rate in addition to text
+             kwargs: generate keyword arguments
+
+         Returns:
+             transcribed text for a single input, otherwise a list of transcribed text
+         """
+
+         # Convert single element to list
+         values = [audio] if self.isaudio(audio) else audio
+
+         # Read input audio
+         speech = self.read(values, rate)
+
+         # Apply transformation rules and store results
+         results = self.batchprocess(speech, chunk, **kwargs) if chunk and not join else self.process(speech, chunk, **kwargs)
+
+         # Return single element if single element passed in
+         return results[0] if self.isaudio(audio) else results
+
+     def isaudio(self, audio):
+         """
+         Checks if input is a single audio element.
+
+         Args:
+             audio: audio|list
+
+         Returns:
+             True if input is an audio element, False otherwise
+         """
+
+         return isinstance(audio, (str, tuple, np.ndarray)) or hasattr(audio, "read")
+
+     def read(self, audio, rate):
+         """
+         Reads audio to raw waveforms and sample rates.
+
+         Args:
+             audio: audio|list
+             rate: optional sample rate
+
+         Returns:
+             list of (audio data, sample rate)
+         """
+
+         speech = []
+         for x in audio:
+             if isinstance(x, str) or hasattr(x, "read"):
+                 # Read file or file-like object
+                 raw, samplerate = sf.read(x)
+             elif isinstance(x, tuple):
+                 # Input is NumPy array and sample rate
+                 raw, samplerate = x
+             else:
+                 # Input is NumPy array
+                 raw, samplerate = x, rate
+
+             speech.append((raw, samplerate))
+
+         return speech
+
+     def process(self, speech, chunk, **kwargs):
+         """
+         Standard processing loop. Runs a single pipeline call for all speech inputs along
+         with the chunk size. Returns text for each input.
+
+         Args:
+             speech: list of (audio data, sample rate)
+             chunk: split audio into chunk seconds sized segments for processing
+             kwargs: generate keyword arguments
+
+         Returns:
+             list of transcribed text
+         """
+
+         results = []
+         for result in self.pipeline([self.convert(*x) for x in speech], chunk_length_s=chunk, ignore_warning=True, generate_kwargs=kwargs):
+             # Store result
+             results.append(self.clean(result["text"]))
+
+         return results
+
+     def batchprocess(self, speech, chunk, **kwargs):
+         """
+         Batch processing loop. Runs a pipeline call per speech input. Each speech input is split
+         into chunk duration segments. Each segment is individually transcribed and returned along with
+         the raw wav snippets.
+
+         Args:
+             speech: list of (audio data, sample rate)
+             chunk: split audio into chunk seconds sized segments for processing
+             kwargs: generate keyword arguments
+
+         Returns:
+             list of lists of dicts - each dict has text, raw wav data for text and sample rate
+         """
+
+         results = []
+
+         # Process each element individually to get time-sliced chunks
+         for raw, rate in speech:
+             # Get segments for current speech entry
+             segments = self.segments(raw, rate, chunk)
+
+             # Process segments, store raw data before processing given the pipeline modifies it
+             sresults = []
+             for x, result in enumerate(self.pipeline([self.convert(*x) for x in segments], generate_kwargs=kwargs)):
+                 sresults.append({"text": self.clean(result["text"]), "raw": segments[x][0], "rate": segments[x][1]})
+
+             results.append(sresults)
+
+         return results
+
+     def segments(self, raw, rate, chunk):
+         """
+         Builds chunk duration batches.
+
+         Args:
+             raw: raw audio data
+             rate: sample rate
+             chunk: chunk duration size
+
+         Returns:
+             list of (audio segment, sample rate)
+         """
+
+         segments = []
+
+         # Split into batches, use sample rate * chunk seconds
+         for segment in self.batch(raw, rate * chunk):
+             segments.append((segment, rate))
+
+         return segments
+
+     def convert(self, raw, rate):
+         """
+         Converts input audio to mono with a sample rate equal to the pipeline model's
+         sample rate.
+
+         Args:
+             raw: raw audio data
+             rate: target sample rate
+
+         Returns:
+             audio data ready for pipeline model
+         """
+
+         # Convert stereo to mono, if necessary
+         raw = Signal.mono(raw)
+
+         # Resample to target sample rate
+         target = self.pipeline.feature_extractor.sampling_rate
+         return {"raw": Signal.resample(raw, rate, target), "sampling_rate": target}
+
+     def clean(self, text):
+         """
+         Applies text normalization rules.
+
+         Args:
+             text: input text
+
+         Returns:
+             clean text
+         """
+
+         # Trim whitespace
+         text = text.strip()
+
+         # Convert all upper case strings to capitalized case
+         return text.capitalize() if text.isupper() else text
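
For orientation, a minimal usage sketch of the pipeline above (not part of the diff; the model path and file name are illustrative, and the "pipeline" extra plus libsndfile are assumed to be installed):

from txtai.pipeline import Transcription

# Load an automatic speech recognition model; path is illustrative
transcribe = Transcription("openai/whisper-base")

# Single audio file in, single string out
text = transcribe("speech.wav")

# join=False returns per-chunk dicts with text, raw audio and sample rate
chunks = transcribe("speech.wav", chunk=10, join=False)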
txtai/pipeline/base.py ADDED
@@ -0,0 +1,23 @@
+ """
+ Pipeline module
+ """
+
+
+ class Pipeline:
+     """
+     Base class for all Pipelines. The only interface requirement is to define a __call__ method.
+     """
+
+     def batch(self, data, size):
+         """
+         Splits data into batches of the specified size.
+
+         Args:
+             data: data elements
+             size: batch size
+
+         Returns:
+             list of evenly sized batches with the last batch having the remaining elements
+         """
+
+         return [data[x : x + size] for x in range(0, len(data), size)]
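
A quick worked example of batch, since Transcription.segments above relies on its slicing behavior (a sketch; slicing works for any sliceable sequence, including NumPy arrays):

from txtai.pipeline import Pipeline

# 7 elements with size 3 -> [[0, 1, 2], [3, 4, 5], [6]]
print(Pipeline().batch(list(range(7)), 3))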
txtai/pipeline/data/__init__.py ADDED
@@ -0,0 +1,10 @@
+ """
+ Segment imports
+ """
+
+ from .filetohtml import FileToHTML
+ from .htmltomd import HTMLToMarkdown
+ from .segmentation import Segmentation
+ from .tabular import Tabular
+ from .textractor import Textractor
+ from .tokenizer import Tokenizer
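
For reference, a small sketch of what these re-exports enable (module path taken from the file listing above):

# Pipelines can be imported from the package namespace rather than individual modules
from txtai.pipeline.data import Segmentation, Textractor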
txtai/pipeline/data/filetohtml.py ADDED
@@ -0,0 +1,206 @@
+ """
+ FileToHTML module
+ """
+
+ import os
+ import re
+
+ from subprocess import Popen
+
+ # Conditional import
+ try:
+     from tika import detector, parser
+
+     TIKA = True
+ except ImportError:
+     TIKA = False
+
+ # Conditional import
+ try:
+     from docling.document_converter import DocumentConverter
+
+     DOCLING = True
+ except ImportError:
+     DOCLING = False
+
+ from ..base import Pipeline
+
+
+ class FileToHTML(Pipeline):
+     """
+     File to HTML pipeline.
+     """
+
+     def __init__(self, backend="available"):
+         """
+         Creates a new File to HTML pipeline.
+
+         Args:
+             backend: backend to use to extract content, supports "tika", "docling" or "available" (default) which finds the first available
+         """
+
+         # Lowercase backend parameter
+         backend = backend.lower() if backend else None
+
+         # Check for available backend
+         if backend == "available":
+             backend = "tika" if Tika.available() else "docling" if Docling.available() else None
+
+         # Create backend instance
+         self.backend = Tika() if backend == "tika" else Docling() if backend == "docling" else None
+
+     def __call__(self, path):
+         """
+         Converts file at path to HTML. Returns None if no backend is available.
+
+         Args:
+             path: input file path
+
+         Returns:
+             html if a backend is available, otherwise returns None
+         """
+
+         return self.backend(path) if self.backend else None
+
+
+ class Tika:
+     """
+     File to HTML conversion via Apache Tika.
+     """
+
+     @staticmethod
+     def available():
+         """
+         Checks if a Java executable is available and Tika is installed.
+
+         Returns:
+             True if Java is available and Tika is installed, False otherwise
+         """
+
+         # Get path to Java executable
+         path = os.environ.get("TIKA_JAVA", "java")
+
+         # pylint: disable=R1732,W0702,W1514
+         # Check if Java binary is available on path
+         try:
+             _ = Popen(path, stdout=open(os.devnull, "w"), stderr=open(os.devnull, "w"))
+         except:
+             return False
+
+         # Return True if Java is available AND Tika is installed
+         return TIKA
+
+     def __init__(self):
+         """
+         Creates a new Tika instance.
+         """
+
+         if not Tika.available():
+             raise ImportError('Tika engine is not available - install "pipeline" extra to enable. Also check that Java is available.')
+
+     def __call__(self, path):
+         """
+         Parses content to HTML.
+
+         Args:
+             path: file path
+
+         Returns:
+             html
+         """
+
+         # Skip parsing if input is plain text or HTML
+         mimetype = detector.from_file(path)
+         if mimetype in ("text/plain", "text/html", "text/xhtml"):
+             return None
+
+         # Parse content to HTML
+         parsed = parser.from_file(path, xmlContent=True)
+         return parsed["content"]
+
+
+ class Docling:
+     """
+     File to HTML conversion via Docling.
+     """
+
+     @staticmethod
+     def available():
+         """
+         Checks if Docling is available.
+
+         Returns:
+             True if Docling is available, False otherwise
+         """
+
+         return DOCLING
+
+     def __init__(self):
+         """
+         Creates a new Docling instance.
+         """
+
+         if not Docling.available():
+             raise ImportError('Docling engine is not available - install "pipeline" extra to enable')
+
+         self.converter = DocumentConverter()
+
+     def __call__(self, path):
+         """
+         Parses content to HTML.
+
+         Args:
+             path: file path
+
+         Returns:
+             html
+         """
+
+         # Skip parsing if input is HTML
+         if self.ishtml(path):
+             return None
+
+         # Parse content to HTML
+         html = self.converter.convert(path).document.export_to_html(html_head="<head/>")
+
+         # Normalize HTML and return
+         return self.normalize(html)
+
+     def ishtml(self, path):
+         """
+         Detects if this file looks like HTML.
+
+         Args:
+             path: file path
+
+         Returns:
+             True if this is HTML
+         """
+
+         with open(path, "rb") as f:
+             # Read first 1024 bytes, ignore encoding errors and strip leading/trailing whitespace
+             content = f.read(1024)
+             content = content.decode("ascii", errors="ignore").lower().strip()
+
+         # Check for HTML
+         return re.search(r"<!doctype\s+html|<html|<head|<body", content)
+
+     def normalize(self, html):
+         """
+         Applies normalization rules to make HTML consistent with other text extraction backends.
+
+         Args:
+             html: input html
+
+         Returns:
+             normalized html
+         """
+
+         # Wrap content with a body tag, if necessary
+         html = html.replace("<head/>", "<head/><body>").replace("</html>", "</body></html>") if "<body>" not in html else html
+
+         # Remove bullets from list items
+         html = re.sub(r"<li>\xb7 ", r"<li>", html)
+
+         # Add spacing between paragraphs
+         return html.replace("</p>", "</p><p/>")
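
Closing with a minimal usage sketch of FileToHTML (not part of the diff; the file name is illustrative, and backend availability depends on having Java plus tika, or docling, installed):

from txtai.pipeline import FileToHTML

# "available" selects Tika when Java and tika are present, else Docling, else no backend
html = FileToHTML(backend="available")("report.pdf")

# None means no backend was available or the input was already plain text/HTML
if html:
    print(html)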