mseep-txtai 9.1.1 (py3-none-any.whl)
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
- mseep_txtai-9.1.1.dist-info/METADATA +262 -0
- mseep_txtai-9.1.1.dist-info/RECORD +251 -0
- mseep_txtai-9.1.1.dist-info/WHEEL +5 -0
- mseep_txtai-9.1.1.dist-info/licenses/LICENSE +190 -0
- mseep_txtai-9.1.1.dist-info/top_level.txt +1 -0
- txtai/__init__.py +16 -0
- txtai/agent/__init__.py +12 -0
- txtai/agent/base.py +54 -0
- txtai/agent/factory.py +39 -0
- txtai/agent/model.py +107 -0
- txtai/agent/placeholder.py +16 -0
- txtai/agent/tool/__init__.py +7 -0
- txtai/agent/tool/embeddings.py +69 -0
- txtai/agent/tool/factory.py +130 -0
- txtai/agent/tool/function.py +49 -0
- txtai/ann/__init__.py +7 -0
- txtai/ann/base.py +153 -0
- txtai/ann/dense/__init__.py +11 -0
- txtai/ann/dense/annoy.py +72 -0
- txtai/ann/dense/factory.py +76 -0
- txtai/ann/dense/faiss.py +233 -0
- txtai/ann/dense/hnsw.py +104 -0
- txtai/ann/dense/numpy.py +164 -0
- txtai/ann/dense/pgvector.py +323 -0
- txtai/ann/dense/sqlite.py +303 -0
- txtai/ann/dense/torch.py +38 -0
- txtai/ann/sparse/__init__.py +7 -0
- txtai/ann/sparse/factory.py +61 -0
- txtai/ann/sparse/ivfsparse.py +377 -0
- txtai/ann/sparse/pgsparse.py +56 -0
- txtai/api/__init__.py +18 -0
- txtai/api/application.py +134 -0
- txtai/api/authorization.py +53 -0
- txtai/api/base.py +159 -0
- txtai/api/cluster.py +295 -0
- txtai/api/extension.py +19 -0
- txtai/api/factory.py +40 -0
- txtai/api/responses/__init__.py +7 -0
- txtai/api/responses/factory.py +30 -0
- txtai/api/responses/json.py +56 -0
- txtai/api/responses/messagepack.py +51 -0
- txtai/api/route.py +41 -0
- txtai/api/routers/__init__.py +25 -0
- txtai/api/routers/agent.py +38 -0
- txtai/api/routers/caption.py +42 -0
- txtai/api/routers/embeddings.py +280 -0
- txtai/api/routers/entity.py +42 -0
- txtai/api/routers/extractor.py +28 -0
- txtai/api/routers/labels.py +47 -0
- txtai/api/routers/llm.py +61 -0
- txtai/api/routers/objects.py +42 -0
- txtai/api/routers/openai.py +191 -0
- txtai/api/routers/rag.py +61 -0
- txtai/api/routers/reranker.py +46 -0
- txtai/api/routers/segmentation.py +42 -0
- txtai/api/routers/similarity.py +48 -0
- txtai/api/routers/summary.py +46 -0
- txtai/api/routers/tabular.py +42 -0
- txtai/api/routers/textractor.py +42 -0
- txtai/api/routers/texttospeech.py +33 -0
- txtai/api/routers/transcription.py +42 -0
- txtai/api/routers/translation.py +46 -0
- txtai/api/routers/upload.py +36 -0
- txtai/api/routers/workflow.py +28 -0
- txtai/app/__init__.py +5 -0
- txtai/app/base.py +821 -0
- txtai/archive/__init__.py +9 -0
- txtai/archive/base.py +104 -0
- txtai/archive/compress.py +51 -0
- txtai/archive/factory.py +25 -0
- txtai/archive/tar.py +49 -0
- txtai/archive/zip.py +35 -0
- txtai/cloud/__init__.py +8 -0
- txtai/cloud/base.py +106 -0
- txtai/cloud/factory.py +70 -0
- txtai/cloud/hub.py +101 -0
- txtai/cloud/storage.py +125 -0
- txtai/console/__init__.py +5 -0
- txtai/console/__main__.py +22 -0
- txtai/console/base.py +264 -0
- txtai/data/__init__.py +10 -0
- txtai/data/base.py +138 -0
- txtai/data/labels.py +42 -0
- txtai/data/questions.py +135 -0
- txtai/data/sequences.py +48 -0
- txtai/data/texts.py +68 -0
- txtai/data/tokens.py +28 -0
- txtai/database/__init__.py +14 -0
- txtai/database/base.py +342 -0
- txtai/database/client.py +227 -0
- txtai/database/duckdb.py +150 -0
- txtai/database/embedded.py +76 -0
- txtai/database/encoder/__init__.py +8 -0
- txtai/database/encoder/base.py +37 -0
- txtai/database/encoder/factory.py +56 -0
- txtai/database/encoder/image.py +43 -0
- txtai/database/encoder/serialize.py +28 -0
- txtai/database/factory.py +77 -0
- txtai/database/rdbms.py +569 -0
- txtai/database/schema/__init__.py +6 -0
- txtai/database/schema/orm.py +99 -0
- txtai/database/schema/statement.py +98 -0
- txtai/database/sql/__init__.py +8 -0
- txtai/database/sql/aggregate.py +178 -0
- txtai/database/sql/base.py +189 -0
- txtai/database/sql/expression.py +404 -0
- txtai/database/sql/token.py +342 -0
- txtai/database/sqlite.py +57 -0
- txtai/embeddings/__init__.py +7 -0
- txtai/embeddings/base.py +1107 -0
- txtai/embeddings/index/__init__.py +14 -0
- txtai/embeddings/index/action.py +15 -0
- txtai/embeddings/index/autoid.py +92 -0
- txtai/embeddings/index/configuration.py +71 -0
- txtai/embeddings/index/documents.py +86 -0
- txtai/embeddings/index/functions.py +155 -0
- txtai/embeddings/index/indexes.py +199 -0
- txtai/embeddings/index/indexids.py +60 -0
- txtai/embeddings/index/reducer.py +104 -0
- txtai/embeddings/index/stream.py +67 -0
- txtai/embeddings/index/transform.py +205 -0
- txtai/embeddings/search/__init__.py +11 -0
- txtai/embeddings/search/base.py +344 -0
- txtai/embeddings/search/errors.py +9 -0
- txtai/embeddings/search/explain.py +120 -0
- txtai/embeddings/search/ids.py +61 -0
- txtai/embeddings/search/query.py +69 -0
- txtai/embeddings/search/scan.py +196 -0
- txtai/embeddings/search/terms.py +46 -0
- txtai/graph/__init__.py +10 -0
- txtai/graph/base.py +769 -0
- txtai/graph/factory.py +61 -0
- txtai/graph/networkx.py +275 -0
- txtai/graph/query.py +181 -0
- txtai/graph/rdbms.py +113 -0
- txtai/graph/topics.py +166 -0
- txtai/models/__init__.py +9 -0
- txtai/models/models.py +268 -0
- txtai/models/onnx.py +133 -0
- txtai/models/pooling/__init__.py +9 -0
- txtai/models/pooling/base.py +141 -0
- txtai/models/pooling/cls.py +28 -0
- txtai/models/pooling/factory.py +144 -0
- txtai/models/pooling/late.py +173 -0
- txtai/models/pooling/mean.py +33 -0
- txtai/models/pooling/muvera.py +164 -0
- txtai/models/registry.py +37 -0
- txtai/models/tokendetection.py +122 -0
- txtai/pipeline/__init__.py +17 -0
- txtai/pipeline/audio/__init__.py +11 -0
- txtai/pipeline/audio/audiomixer.py +58 -0
- txtai/pipeline/audio/audiostream.py +94 -0
- txtai/pipeline/audio/microphone.py +244 -0
- txtai/pipeline/audio/signal.py +186 -0
- txtai/pipeline/audio/texttoaudio.py +60 -0
- txtai/pipeline/audio/texttospeech.py +553 -0
- txtai/pipeline/audio/transcription.py +212 -0
- txtai/pipeline/base.py +23 -0
- txtai/pipeline/data/__init__.py +10 -0
- txtai/pipeline/data/filetohtml.py +206 -0
- txtai/pipeline/data/htmltomd.py +414 -0
- txtai/pipeline/data/segmentation.py +178 -0
- txtai/pipeline/data/tabular.py +155 -0
- txtai/pipeline/data/textractor.py +139 -0
- txtai/pipeline/data/tokenizer.py +112 -0
- txtai/pipeline/factory.py +77 -0
- txtai/pipeline/hfmodel.py +111 -0
- txtai/pipeline/hfpipeline.py +96 -0
- txtai/pipeline/image/__init__.py +7 -0
- txtai/pipeline/image/caption.py +55 -0
- txtai/pipeline/image/imagehash.py +90 -0
- txtai/pipeline/image/objects.py +80 -0
- txtai/pipeline/llm/__init__.py +11 -0
- txtai/pipeline/llm/factory.py +86 -0
- txtai/pipeline/llm/generation.py +173 -0
- txtai/pipeline/llm/huggingface.py +218 -0
- txtai/pipeline/llm/litellm.py +90 -0
- txtai/pipeline/llm/llama.py +152 -0
- txtai/pipeline/llm/llm.py +75 -0
- txtai/pipeline/llm/rag.py +477 -0
- txtai/pipeline/nop.py +14 -0
- txtai/pipeline/tensors.py +52 -0
- txtai/pipeline/text/__init__.py +13 -0
- txtai/pipeline/text/crossencoder.py +70 -0
- txtai/pipeline/text/entity.py +140 -0
- txtai/pipeline/text/labels.py +137 -0
- txtai/pipeline/text/lateencoder.py +103 -0
- txtai/pipeline/text/questions.py +48 -0
- txtai/pipeline/text/reranker.py +57 -0
- txtai/pipeline/text/similarity.py +83 -0
- txtai/pipeline/text/summary.py +98 -0
- txtai/pipeline/text/translation.py +298 -0
- txtai/pipeline/train/__init__.py +7 -0
- txtai/pipeline/train/hfonnx.py +196 -0
- txtai/pipeline/train/hftrainer.py +398 -0
- txtai/pipeline/train/mlonnx.py +63 -0
- txtai/scoring/__init__.py +12 -0
- txtai/scoring/base.py +188 -0
- txtai/scoring/bm25.py +29 -0
- txtai/scoring/factory.py +95 -0
- txtai/scoring/pgtext.py +181 -0
- txtai/scoring/sif.py +32 -0
- txtai/scoring/sparse.py +218 -0
- txtai/scoring/terms.py +499 -0
- txtai/scoring/tfidf.py +358 -0
- txtai/serialize/__init__.py +10 -0
- txtai/serialize/base.py +85 -0
- txtai/serialize/errors.py +9 -0
- txtai/serialize/factory.py +29 -0
- txtai/serialize/messagepack.py +42 -0
- txtai/serialize/pickle.py +98 -0
- txtai/serialize/serializer.py +46 -0
- txtai/util/__init__.py +7 -0
- txtai/util/resolver.py +32 -0
- txtai/util/sparsearray.py +62 -0
- txtai/util/template.py +16 -0
- txtai/vectors/__init__.py +8 -0
- txtai/vectors/base.py +476 -0
- txtai/vectors/dense/__init__.py +12 -0
- txtai/vectors/dense/external.py +55 -0
- txtai/vectors/dense/factory.py +121 -0
- txtai/vectors/dense/huggingface.py +44 -0
- txtai/vectors/dense/litellm.py +86 -0
- txtai/vectors/dense/llama.py +84 -0
- txtai/vectors/dense/m2v.py +67 -0
- txtai/vectors/dense/sbert.py +92 -0
- txtai/vectors/dense/words.py +211 -0
- txtai/vectors/recovery.py +57 -0
- txtai/vectors/sparse/__init__.py +7 -0
- txtai/vectors/sparse/base.py +90 -0
- txtai/vectors/sparse/factory.py +55 -0
- txtai/vectors/sparse/sbert.py +34 -0
- txtai/version.py +6 -0
- txtai/workflow/__init__.py +8 -0
- txtai/workflow/base.py +184 -0
- txtai/workflow/execute.py +99 -0
- txtai/workflow/factory.py +42 -0
- txtai/workflow/task/__init__.py +18 -0
- txtai/workflow/task/base.py +490 -0
- txtai/workflow/task/console.py +24 -0
- txtai/workflow/task/export.py +64 -0
- txtai/workflow/task/factory.py +89 -0
- txtai/workflow/task/file.py +28 -0
- txtai/workflow/task/image.py +36 -0
- txtai/workflow/task/retrieve.py +61 -0
- txtai/workflow/task/service.py +102 -0
- txtai/workflow/task/storage.py +110 -0
- txtai/workflow/task/stream.py +33 -0
- txtai/workflow/task/template.py +116 -0
- txtai/workflow/task/url.py +20 -0
- txtai/workflow/task/workflow.py +14 -0
txtai/pipeline/audio/transcription.py
ADDED
@@ -0,0 +1,212 @@
"""
Transcription module
"""

import numpy as np

# Conditional import
try:
    import soundfile as sf

    from .signal import Signal, SCIPY

    TRANSCRIPTION = SCIPY
except (ImportError, OSError):
    TRANSCRIPTION = False

from ..hfpipeline import HFPipeline


class Transcription(HFPipeline):
    """
    Transcribes audio files or data to text.
    """

    def __init__(self, path=None, quantize=False, gpu=True, model=None, **kwargs):
        if not TRANSCRIPTION:
            raise ImportError(
                'Transcription pipeline is not available - install "pipeline" extra to enable. Also check that libsndfile is available.'
            )

        # Call parent constructor
        super().__init__("automatic-speech-recognition", path, quantize, gpu, model, **kwargs)

    def __call__(self, audio, rate=None, chunk=10, join=True, **kwargs):
        """
        Transcribes audio files or data to text.

        This method supports a single audio element or a list of audio. If the input is a single audio
        element, the return type is a string. If the input is a list, a list of strings is returned.

        Args:
            audio: audio|list
            rate: sample rate, only required with raw audio data
            chunk: process audio in chunk second sized segments
            join: if True (default), combine each chunk back together into a single text output.
                  When False, chunks are returned as a list of dicts, each having raw associated audio and
                  sample rate in addition to text
            kwargs: generate keyword arguments

        Returns:
            transcribed text, or a list of transcribed text when a list is passed in
        """

        # Convert single element to list
        values = [audio] if self.isaudio(audio) else audio

        # Read input audio
        speech = self.read(values, rate)

        # Apply transformation rules and store results
        results = self.batchprocess(speech, chunk, **kwargs) if chunk and not join else self.process(speech, chunk, **kwargs)

        # Return single element if single element passed in
        return results[0] if self.isaudio(audio) else results

    def isaudio(self, audio):
        """
        Checks if input is a single audio element.

        Args:
            audio: audio|list

        Returns:
            True if input is an audio element, False otherwise
        """

        return isinstance(audio, (str, tuple, np.ndarray)) or hasattr(audio, "read")

    def read(self, audio, rate):
        """
        Reads audio to raw waveforms and sample rates.

        Args:
            audio: audio|list
            rate: optional sample rate

        Returns:
            list of (audio data, sample rate)
        """

        speech = []
        for x in audio:
            if isinstance(x, str) or hasattr(x, "read"):
                # Read file or file-like object
                raw, samplerate = sf.read(x)
            elif isinstance(x, tuple):
                # Input is NumPy array and sample rate
                raw, samplerate = x
            else:
                # Input is NumPy array
                raw, samplerate = x, rate

            speech.append((raw, samplerate))

        return speech

    def process(self, speech, chunk, **kwargs):
        """
        Standard processing loop. Runs a single pipeline call for all speech inputs along
        with the chunk size. Returns text for each input.

        Args:
            speech: list of (audio data, sample rate)
            chunk: split audio into chunk seconds sized segments for processing
            kwargs: generate keyword arguments

        Returns:
            list of transcribed text
        """

        results = []
        for result in self.pipeline([self.convert(*x) for x in speech], chunk_length_s=chunk, ignore_warning=True, generate_kwargs=kwargs):
            # Store result
            results.append(self.clean(result["text"]))

        return results

    def batchprocess(self, speech, chunk, **kwargs):
        """
        Batch processing loop. Runs a pipeline call per speech input. Each speech input is split
        into chunk duration segments. Each segment is individually transcribed and returned along with
        the raw wav snippets.

        Args:
            speech: list of (audio data, sample rate)
            chunk: split audio into chunk seconds sized segments for processing
            kwargs: generate keyword arguments

        Returns:
            list of lists of dicts - each dict has text, raw wav data for text and sample rate
        """

        results = []

        # Process each element individually to get time-sliced chunks
        for raw, rate in speech:
            # Get segments for current speech entry
            segments = self.segments(raw, rate, chunk)

            # Process segments, store raw data before processing given pipeline modifies it
            sresults = []
            for x, result in enumerate(self.pipeline([self.convert(*x) for x in segments], generate_kwargs=kwargs)):
                sresults.append({"text": self.clean(result["text"]), "raw": segments[x][0], "rate": segments[x][1]})

            results.append(sresults)

        return results

    def segments(self, raw, rate, chunk):
        """
        Builds chunk duration batches.

        Args:
            raw: raw audio data
            rate: sample rate
            chunk: chunk duration size

        Returns:
            list of (audio segment, sample rate)
        """

        segments = []

        # Split into batches, use sample rate * chunk seconds
        for segment in self.batch(raw, rate * chunk):
            segments.append((segment, rate))

        return segments

    def convert(self, raw, rate):
        """
        Converts input audio to mono with a sample rate equal to the pipeline model's
        sample rate.

        Args:
            raw: raw audio data
            rate: input sample rate

        Returns:
            audio data ready for pipeline model
        """

        # Convert stereo to mono, if necessary
        raw = Signal.mono(raw)

        # Resample to target sample rate
        target = self.pipeline.feature_extractor.sampling_rate
        return {"raw": Signal.resample(raw, rate, target), "sampling_rate": target}

    def clean(self, text):
        """
        Applies text normalization rules.

        Args:
            text: input text

        Returns:
            clean text
        """

        # Trim whitespace
        text = text.strip()

        # Convert all upper case strings to capitalized case
        return text.capitalize() if text.isupper() else text
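For reference, a minimal usage sketch of the Transcription pipeline above. This is not part of the package diff; the file name "speech.wav" is illustrative, and loading the default model assumes the "pipeline" extra plus libsndfile are installed.

from txtai.pipeline import Transcription

# Load default automatic-speech-recognition model (illustrative default)
transcribe = Transcription()

# Single audio element in, single string out
text = transcribe("speech.wav")

# join=False returns a list of dicts per 10-second chunk, each with text, raw audio and sample rate
chunks = transcribe("speech.wav", chunk=10, join=False)
print(text, [chunk["text"] for chunk in chunks])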
txtai/pipeline/base.py
ADDED
@@ -0,0 +1,23 @@
"""
Pipeline module
"""


class Pipeline:
    """
    Base class for all Pipelines. The only interface requirement is to define a __call__ method.
    """

    def batch(self, data, size):
        """
        Splits data into batches of the specified size.

        Args:
            data: data elements
            size: batch size

        Returns:
            list of evenly sized batches with the last batch having the remaining elements
        """

        return [data[x : x + size] for x in range(0, len(data), size)]
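A quick sketch of the batch behavior defined above (illustrative values, not part of the package diff):

from txtai.pipeline import Pipeline

# 7 elements split into batches of 3 -> [[0, 1, 2], [3, 4, 5], [6]]
print(Pipeline().batch(list(range(7)), 3))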
txtai/pipeline/data/filetohtml.py
ADDED
@@ -0,0 +1,206 @@
"""
FileToHTML module
"""

import os
import re

from subprocess import Popen

# Conditional import
try:
    from tika import detector, parser

    TIKA = True
except ImportError:
    TIKA = False

# Conditional import
try:
    from docling.document_converter import DocumentConverter

    DOCLING = True
except ImportError:
    DOCLING = False

from ..base import Pipeline


class FileToHTML(Pipeline):
    """
    File to HTML pipeline.
    """

    def __init__(self, backend="available"):
        """
        Creates a new File to HTML pipeline.

        Args:
            backend: backend to use to extract content, supports "tika", "docling" or "available" (default), which selects the first available backend
        """

        # Lowercase backend parameter
        backend = backend.lower() if backend else None

        # Check for available backend
        if backend == "available":
            backend = "tika" if Tika.available() else "docling" if Docling.available() else None

        # Create backend instance
        self.backend = Tika() if backend == "tika" else Docling() if backend == "docling" else None

    def __call__(self, path):
        """
        Converts file at path to HTML. Returns None if no backend is available.

        Args:
            path: input file path

        Returns:
            html if a backend is available, otherwise returns None
        """

        return self.backend(path) if self.backend else None


class Tika:
    """
    File to HTML conversion via Apache Tika.
    """

    @staticmethod
    def available():
        """
        Checks if a Java executable is available and Tika is installed.

        Returns:
            True if Java is available and Tika is installed, False otherwise
        """

        # Get path to Java executable
        path = os.environ.get("TIKA_JAVA", "java")

        # pylint: disable=R1732,W0702,W1514
        # Check if Java binary is available on path
        try:
            _ = Popen(path, stdout=open(os.devnull, "w"), stderr=open(os.devnull, "w"))
        except:
            return False

        # Return True if Java is available AND Tika is installed
        return TIKA

    def __init__(self):
        """
        Creates a new Tika instance.
        """

        if not Tika.available():
            raise ImportError('Tika engine is not available - install "pipeline" extra to enable. Also check that Java is available.')

    def __call__(self, path):
        """
        Parses content to HTML.

        Args:
            path: file path

        Returns:
            html
        """

        # Skip parsing if input is plain text or HTML
        mimetype = detector.from_file(path)
        if mimetype in ("text/plain", "text/html", "text/xhtml"):
            return None

        # Parse content to HTML
        parsed = parser.from_file(path, xmlContent=True)
        return parsed["content"]


class Docling:
    """
    File to HTML conversion via Docling.
    """

    @staticmethod
    def available():
        """
        Checks if Docling is available.

        Returns:
            True if Docling is available, False otherwise
        """

        return DOCLING

    def __init__(self):
        """
        Creates a new Docling instance.
        """

        if not Docling.available():
            raise ImportError('Docling engine is not available - install "pipeline" extra to enable')

        self.converter = DocumentConverter()

    def __call__(self, path):
        """
        Parses content to HTML.

        Args:
            path: file path

        Returns:
            html
        """

        # Skip parsing if input is HTML
        if self.ishtml(path):
            return None

        # Parse content to HTML
        html = self.converter.convert(path).document.export_to_html(html_head="<head/>")

        # Normalize HTML and return
        return self.normalize(html)

    def ishtml(self, path):
        """
        Detects if this file looks like HTML.

        Args:
            path: file path

        Returns:
            True if this is HTML
        """

        with open(path, "rb") as f:
            # Read first 1024 bytes, ignore encoding errors and strip leading/trailing whitespace
            content = f.read(1024)
            content = content.decode("ascii", errors="ignore").lower().strip()

        # Check for HTML
        return re.search(r"<!doctype\s+html|<html|<head|<body", content)

    def normalize(self, html):
        """
        Applies normalization rules to make HTML consistent with other text extraction backends.

        Args:
            html: input html

        Returns:
            normalized html
        """

        # Wrap content with a body tag, if necessary
        html = html.replace("<head/>", "<head/><body>").replace("</html>", "</body></html>") if "<body>" not in html else html

        # Remove bullets from list items
        html = re.sub(r"<li>\xb7 ", r"<li>", html)

        # Add spacing between paragraphs
        return html.replace("</p>", "</p><p/>")
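For reference, a minimal usage sketch of the FileToHTML pipeline above. This is not part of the package diff; "document.pdf" is an illustrative path, the package-level import assumes FileToHTML is exported like other txtai pipelines, and at least one backend must be installed.

from txtai.pipeline import FileToHTML

# Explicitly select the Docling backend; the default "available" prefers Tika when Java is present
tohtml = FileToHTML(backend="docling")

# Returns extracted HTML, or None when the input is already HTML (left for downstream handling)
print(tohtml("document.pdf"))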