mseep_txtai-9.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mseep_txtai-9.1.1.dist-info/METADATA +262 -0
- mseep_txtai-9.1.1.dist-info/RECORD +251 -0
- mseep_txtai-9.1.1.dist-info/WHEEL +5 -0
- mseep_txtai-9.1.1.dist-info/licenses/LICENSE +190 -0
- mseep_txtai-9.1.1.dist-info/top_level.txt +1 -0
- txtai/__init__.py +16 -0
- txtai/agent/__init__.py +12 -0
- txtai/agent/base.py +54 -0
- txtai/agent/factory.py +39 -0
- txtai/agent/model.py +107 -0
- txtai/agent/placeholder.py +16 -0
- txtai/agent/tool/__init__.py +7 -0
- txtai/agent/tool/embeddings.py +69 -0
- txtai/agent/tool/factory.py +130 -0
- txtai/agent/tool/function.py +49 -0
- txtai/ann/__init__.py +7 -0
- txtai/ann/base.py +153 -0
- txtai/ann/dense/__init__.py +11 -0
- txtai/ann/dense/annoy.py +72 -0
- txtai/ann/dense/factory.py +76 -0
- txtai/ann/dense/faiss.py +233 -0
- txtai/ann/dense/hnsw.py +104 -0
- txtai/ann/dense/numpy.py +164 -0
- txtai/ann/dense/pgvector.py +323 -0
- txtai/ann/dense/sqlite.py +303 -0
- txtai/ann/dense/torch.py +38 -0
- txtai/ann/sparse/__init__.py +7 -0
- txtai/ann/sparse/factory.py +61 -0
- txtai/ann/sparse/ivfsparse.py +377 -0
- txtai/ann/sparse/pgsparse.py +56 -0
- txtai/api/__init__.py +18 -0
- txtai/api/application.py +134 -0
- txtai/api/authorization.py +53 -0
- txtai/api/base.py +159 -0
- txtai/api/cluster.py +295 -0
- txtai/api/extension.py +19 -0
- txtai/api/factory.py +40 -0
- txtai/api/responses/__init__.py +7 -0
- txtai/api/responses/factory.py +30 -0
- txtai/api/responses/json.py +56 -0
- txtai/api/responses/messagepack.py +51 -0
- txtai/api/route.py +41 -0
- txtai/api/routers/__init__.py +25 -0
- txtai/api/routers/agent.py +38 -0
- txtai/api/routers/caption.py +42 -0
- txtai/api/routers/embeddings.py +280 -0
- txtai/api/routers/entity.py +42 -0
- txtai/api/routers/extractor.py +28 -0
- txtai/api/routers/labels.py +47 -0
- txtai/api/routers/llm.py +61 -0
- txtai/api/routers/objects.py +42 -0
- txtai/api/routers/openai.py +191 -0
- txtai/api/routers/rag.py +61 -0
- txtai/api/routers/reranker.py +46 -0
- txtai/api/routers/segmentation.py +42 -0
- txtai/api/routers/similarity.py +48 -0
- txtai/api/routers/summary.py +46 -0
- txtai/api/routers/tabular.py +42 -0
- txtai/api/routers/textractor.py +42 -0
- txtai/api/routers/texttospeech.py +33 -0
- txtai/api/routers/transcription.py +42 -0
- txtai/api/routers/translation.py +46 -0
- txtai/api/routers/upload.py +36 -0
- txtai/api/routers/workflow.py +28 -0
- txtai/app/__init__.py +5 -0
- txtai/app/base.py +821 -0
- txtai/archive/__init__.py +9 -0
- txtai/archive/base.py +104 -0
- txtai/archive/compress.py +51 -0
- txtai/archive/factory.py +25 -0
- txtai/archive/tar.py +49 -0
- txtai/archive/zip.py +35 -0
- txtai/cloud/__init__.py +8 -0
- txtai/cloud/base.py +106 -0
- txtai/cloud/factory.py +70 -0
- txtai/cloud/hub.py +101 -0
- txtai/cloud/storage.py +125 -0
- txtai/console/__init__.py +5 -0
- txtai/console/__main__.py +22 -0
- txtai/console/base.py +264 -0
- txtai/data/__init__.py +10 -0
- txtai/data/base.py +138 -0
- txtai/data/labels.py +42 -0
- txtai/data/questions.py +135 -0
- txtai/data/sequences.py +48 -0
- txtai/data/texts.py +68 -0
- txtai/data/tokens.py +28 -0
- txtai/database/__init__.py +14 -0
- txtai/database/base.py +342 -0
- txtai/database/client.py +227 -0
- txtai/database/duckdb.py +150 -0
- txtai/database/embedded.py +76 -0
- txtai/database/encoder/__init__.py +8 -0
- txtai/database/encoder/base.py +37 -0
- txtai/database/encoder/factory.py +56 -0
- txtai/database/encoder/image.py +43 -0
- txtai/database/encoder/serialize.py +28 -0
- txtai/database/factory.py +77 -0
- txtai/database/rdbms.py +569 -0
- txtai/database/schema/__init__.py +6 -0
- txtai/database/schema/orm.py +99 -0
- txtai/database/schema/statement.py +98 -0
- txtai/database/sql/__init__.py +8 -0
- txtai/database/sql/aggregate.py +178 -0
- txtai/database/sql/base.py +189 -0
- txtai/database/sql/expression.py +404 -0
- txtai/database/sql/token.py +342 -0
- txtai/database/sqlite.py +57 -0
- txtai/embeddings/__init__.py +7 -0
- txtai/embeddings/base.py +1107 -0
- txtai/embeddings/index/__init__.py +14 -0
- txtai/embeddings/index/action.py +15 -0
- txtai/embeddings/index/autoid.py +92 -0
- txtai/embeddings/index/configuration.py +71 -0
- txtai/embeddings/index/documents.py +86 -0
- txtai/embeddings/index/functions.py +155 -0
- txtai/embeddings/index/indexes.py +199 -0
- txtai/embeddings/index/indexids.py +60 -0
- txtai/embeddings/index/reducer.py +104 -0
- txtai/embeddings/index/stream.py +67 -0
- txtai/embeddings/index/transform.py +205 -0
- txtai/embeddings/search/__init__.py +11 -0
- txtai/embeddings/search/base.py +344 -0
- txtai/embeddings/search/errors.py +9 -0
- txtai/embeddings/search/explain.py +120 -0
- txtai/embeddings/search/ids.py +61 -0
- txtai/embeddings/search/query.py +69 -0
- txtai/embeddings/search/scan.py +196 -0
- txtai/embeddings/search/terms.py +46 -0
- txtai/graph/__init__.py +10 -0
- txtai/graph/base.py +769 -0
- txtai/graph/factory.py +61 -0
- txtai/graph/networkx.py +275 -0
- txtai/graph/query.py +181 -0
- txtai/graph/rdbms.py +113 -0
- txtai/graph/topics.py +166 -0
- txtai/models/__init__.py +9 -0
- txtai/models/models.py +268 -0
- txtai/models/onnx.py +133 -0
- txtai/models/pooling/__init__.py +9 -0
- txtai/models/pooling/base.py +141 -0
- txtai/models/pooling/cls.py +28 -0
- txtai/models/pooling/factory.py +144 -0
- txtai/models/pooling/late.py +173 -0
- txtai/models/pooling/mean.py +33 -0
- txtai/models/pooling/muvera.py +164 -0
- txtai/models/registry.py +37 -0
- txtai/models/tokendetection.py +122 -0
- txtai/pipeline/__init__.py +17 -0
- txtai/pipeline/audio/__init__.py +11 -0
- txtai/pipeline/audio/audiomixer.py +58 -0
- txtai/pipeline/audio/audiostream.py +94 -0
- txtai/pipeline/audio/microphone.py +244 -0
- txtai/pipeline/audio/signal.py +186 -0
- txtai/pipeline/audio/texttoaudio.py +60 -0
- txtai/pipeline/audio/texttospeech.py +553 -0
- txtai/pipeline/audio/transcription.py +212 -0
- txtai/pipeline/base.py +23 -0
- txtai/pipeline/data/__init__.py +10 -0
- txtai/pipeline/data/filetohtml.py +206 -0
- txtai/pipeline/data/htmltomd.py +414 -0
- txtai/pipeline/data/segmentation.py +178 -0
- txtai/pipeline/data/tabular.py +155 -0
- txtai/pipeline/data/textractor.py +139 -0
- txtai/pipeline/data/tokenizer.py +112 -0
- txtai/pipeline/factory.py +77 -0
- txtai/pipeline/hfmodel.py +111 -0
- txtai/pipeline/hfpipeline.py +96 -0
- txtai/pipeline/image/__init__.py +7 -0
- txtai/pipeline/image/caption.py +55 -0
- txtai/pipeline/image/imagehash.py +90 -0
- txtai/pipeline/image/objects.py +80 -0
- txtai/pipeline/llm/__init__.py +11 -0
- txtai/pipeline/llm/factory.py +86 -0
- txtai/pipeline/llm/generation.py +173 -0
- txtai/pipeline/llm/huggingface.py +218 -0
- txtai/pipeline/llm/litellm.py +90 -0
- txtai/pipeline/llm/llama.py +152 -0
- txtai/pipeline/llm/llm.py +75 -0
- txtai/pipeline/llm/rag.py +477 -0
- txtai/pipeline/nop.py +14 -0
- txtai/pipeline/tensors.py +52 -0
- txtai/pipeline/text/__init__.py +13 -0
- txtai/pipeline/text/crossencoder.py +70 -0
- txtai/pipeline/text/entity.py +140 -0
- txtai/pipeline/text/labels.py +137 -0
- txtai/pipeline/text/lateencoder.py +103 -0
- txtai/pipeline/text/questions.py +48 -0
- txtai/pipeline/text/reranker.py +57 -0
- txtai/pipeline/text/similarity.py +83 -0
- txtai/pipeline/text/summary.py +98 -0
- txtai/pipeline/text/translation.py +298 -0
- txtai/pipeline/train/__init__.py +7 -0
- txtai/pipeline/train/hfonnx.py +196 -0
- txtai/pipeline/train/hftrainer.py +398 -0
- txtai/pipeline/train/mlonnx.py +63 -0
- txtai/scoring/__init__.py +12 -0
- txtai/scoring/base.py +188 -0
- txtai/scoring/bm25.py +29 -0
- txtai/scoring/factory.py +95 -0
- txtai/scoring/pgtext.py +181 -0
- txtai/scoring/sif.py +32 -0
- txtai/scoring/sparse.py +218 -0
- txtai/scoring/terms.py +499 -0
- txtai/scoring/tfidf.py +358 -0
- txtai/serialize/__init__.py +10 -0
- txtai/serialize/base.py +85 -0
- txtai/serialize/errors.py +9 -0
- txtai/serialize/factory.py +29 -0
- txtai/serialize/messagepack.py +42 -0
- txtai/serialize/pickle.py +98 -0
- txtai/serialize/serializer.py +46 -0
- txtai/util/__init__.py +7 -0
- txtai/util/resolver.py +32 -0
- txtai/util/sparsearray.py +62 -0
- txtai/util/template.py +16 -0
- txtai/vectors/__init__.py +8 -0
- txtai/vectors/base.py +476 -0
- txtai/vectors/dense/__init__.py +12 -0
- txtai/vectors/dense/external.py +55 -0
- txtai/vectors/dense/factory.py +121 -0
- txtai/vectors/dense/huggingface.py +44 -0
- txtai/vectors/dense/litellm.py +86 -0
- txtai/vectors/dense/llama.py +84 -0
- txtai/vectors/dense/m2v.py +67 -0
- txtai/vectors/dense/sbert.py +92 -0
- txtai/vectors/dense/words.py +211 -0
- txtai/vectors/recovery.py +57 -0
- txtai/vectors/sparse/__init__.py +7 -0
- txtai/vectors/sparse/base.py +90 -0
- txtai/vectors/sparse/factory.py +55 -0
- txtai/vectors/sparse/sbert.py +34 -0
- txtai/version.py +6 -0
- txtai/workflow/__init__.py +8 -0
- txtai/workflow/base.py +184 -0
- txtai/workflow/execute.py +99 -0
- txtai/workflow/factory.py +42 -0
- txtai/workflow/task/__init__.py +18 -0
- txtai/workflow/task/base.py +490 -0
- txtai/workflow/task/console.py +24 -0
- txtai/workflow/task/export.py +64 -0
- txtai/workflow/task/factory.py +89 -0
- txtai/workflow/task/file.py +28 -0
- txtai/workflow/task/image.py +36 -0
- txtai/workflow/task/retrieve.py +61 -0
- txtai/workflow/task/service.py +102 -0
- txtai/workflow/task/storage.py +110 -0
- txtai/workflow/task/stream.py +33 -0
- txtai/workflow/task/template.py +116 -0
- txtai/workflow/task/url.py +20 -0
- txtai/workflow/task/workflow.py +14 -0
txtai/pipeline/data/htmltomd.py
@@ -0,0 +1,414 @@
"""
HTMLToMarkdown module
"""

import re

# Conditional import
try:
    from bs4 import BeautifulSoup, NavigableString

    SOUP = True
except ImportError:
    SOUP = False

from ..base import Pipeline


class HTMLToMarkdown(Pipeline):
    """
    HTML to Markdown pipeline.

    Markdown formatting is applied for headings, blockquotes, lists, code, tables and text. Visual formatting is also
    included (bold, italic etc).

    This pipeline searches for the best node that has relevant text, often found with an article, main or body tag.
    """

    def __init__(self, paragraphs=False, sections=False):
        """
        Create a new Extract instance.

        Args:
            paragraphs: True if paragraph parsing enabled, False otherwise
            sections: True if section parsing enabled, False otherwise
        """

        if not SOUP:
            raise ImportError('HTMLToMarkdown pipeline is not available - install "pipeline" extra to enable')

        self.paragraphs = paragraphs
        self.sections = sections

    def __call__(self, html):
        """
        Transforms input HTML into Markdown formatted text.

        Args:
            html: input html

        Returns:
            markdown formatted text
        """

        # HTML Parser
        soup = BeautifulSoup(html, features="html.parser")

        # Ignore script and style tags
        for script in soup.find_all(["script", "style"]):
            script.decompose()

        # Check for article sections
        article = next((x for x in ["article", "main"] if soup.find(x)), None)

        # Extract text from each section element
        nodes = []
        for node in soup.find_all(article if article else "body"):
            # Skip article sections without at least 1 paragraph
            if not article or node.find("p"):
                nodes.append(self.process(node, article))

        # Return extracted text, fallback to default text extraction if no nodes found
        return "\n".join(self.metadata(soup) + nodes) if nodes else self.default(soup)

    def process(self, node, article):
        """
        Extracts text from a node. This method applies transforms for headings, blockquotes, lists, code, tables and text.
        Page breaks are detected and reflected in the output text as a page break character.

        Args:
            node: input node
            article: True if the main section node is an article

        Returns:
            node text
        """

        if self.isheader(node):
            return self.header(node, article)

        if node.name in ("blockquote", "q"):
            return self.block(node)

        if node.name in ("ul", "ol"):
            return self.items(node, article)

        if node.name in ("code", "pre"):
            return self.code(node)

        if node.name == "table":
            return self.table(node, article)

        # Nodes to skip
        if node.name in ("aside",) + (() if article else ("header", "footer")):
            return ""

        # Get page break symbol, if available
        page = node.name and node.get("class") and "page" in node.get("class")

        # Get node children
        children = self.children(node)

        # Join elements into text
        if self.iscontainer(node, children):
            texts = [self.process(node, article) for node in children]
            text = "\n".join(text for text in texts if text or not article)
        else:
            text = self.text(node, article)

        # Add page breaks, if section parsing enabled. Otherwise add node text.
        return f"{text}\f" if page and self.sections else text

    def metadata(self, node):
        """
        Builds a metadata section. The metadata section consists of the title and
        description fields.

        Args:
            node: input document node

        Returns:
            metadata as a list
        """

        title = node.find("title")
        metadata = [f"**{title.text.strip()}**"] if title and title.text else []

        description = node.find("meta", attrs={"name": "description"})
        if description and description["content"]:
            metadata.append(f"\n*{description['content'].strip()}*")

        # Add separator
        if metadata:
            metadata.append("\f" if self.sections else "\n\n")

        return metadata

    def default(self, soup):
        """
        Default text handler when valid HTML isn't detected.

        Args:
            soup: BeautifulSoup object

        Returns:
            text
        """

        lines = []
        for line in soup.get_text().split("\n"):
            # Detect markdown headings and add page breaks
            lines.append(f"\f{line}" if self.sections and re.search(r"^#+ ", line) else line)

        return "\n".join(lines)

    def text(self, node, article):
        """
        Text handler. This method flattens a node and it's children to text.

        Args:
            node: input node
            article: True if the main section node is an article

        Returns:
            node text
        """

        # Get node children if available, otherwise use node as item
        items = self.children(node)
        items = items if items else [node]

        # Apply emphasis and link formatting
        texts = []
        for x in items:
            target, text = x if x.name else node, x.text

            if text.strip():
                if target.name in ("b", "strong"):
                    text = f"**{text.strip()}** "
                elif target.name in ("i", "em"):
                    text = f"*{text.strip()}* "
                elif target.name == "a":
                    text = f"[{text.strip()}]({target.get('href')}) "

            texts.append(text)

        # Join text elements
        text = "".join(texts)

        # Article text processing
        text = self.articletext(node, text) if article else text

        # Return text, strip leading/trailing whitespace if this is a string only node
        text = text if node.name and text else text.strip()

        return text

    def header(self, node, article):
        """
        Header handler. This method transforms a HTML heading into a Markdown formatted heading.

        Args:
            node: input node
            article: True if the main section node is an article

        Returns:
            heading as markdown
        """

        # Get heading level and text
        level = "#" * int(node.name[1])
        text = self.text(node, article)

        # Add section break or newline, if necessary
        level = f"\f{level}" if self.sections else f"\n{level}"

        # Return formatted header. Remove leading whitespace as it was added before level in step above.
        return f"{level} {text.lstrip()}" if text.strip() else ""

    def block(self, node):
        """
        Blockquote handler. This method transforms a HTML blockquote or q block into a Markdown formatted
        blockquote

        Args:
            node: input node

        Returns:
            block as markdown
        """

        text = "\n".join(f"> {x}" for x in node.text.strip().split("\n"))
        return f"{text}\n\n" if self.paragraphs else f"{text}\n"

    def items(self, node, article):
        """
        List handler. This method transforms a HTML ordered/unordered list into a Markdown formatted list.

        Args:
            node: input node
            article: True if the main section node is an article

        Returns:
            list as markdown
        """

        elements = []
        for x, element in enumerate(node.find_all("li")):
            # Unordered lists use dashes. Ordered lists use numbers.
            prefix = "-" if node.name == "ul" else f"{x + 1}."

            # List item text
            text = self.process(element, article)

            # Add list element
            if text:
                elements.append(f"{prefix} {text}")

        # Join elements together as string
        return "\n".join(elements)

    def code(self, node):
        """
        Code block handler. This method transforms a HTML pre or code block into a Markdown formatted
        code block.

        Args:
            node: input node

        Returns:
            code as markdown
        """

        text = f"```\n{node.text.strip()}\n```"
        return f"{text}\n\n" if self.paragraphs else f"{text}\n"

    def table(self, node, article):
        """
        Table handler. This method transforms a HTML table into a Markdown formatted table.

        Args:
            node: input node
            article: True if the main section node is an article

        Returns:
            table as markdown
        """

        elements, header = [], False

        # Process all rows
        rows = node.find_all("tr")
        for row in rows:
            # Get list of columns for row
            columns = row.find_all(lambda tag: tag.name in ("th", "td"))

            # Add columns with separator
            elements.append(f"|{'|'.join(self.process(column, article) for column in columns)}|")

            # If there are multiple rows, add header format row
            if not header and len(rows) > 1:
                elements.append(f"{'|---' * len(columns)}|")
                header = True

        # Join elements together as string
        return "\n".join(elements)

    def iscontainer(self, node, children):
        """
        Analyzes a node and it's children to determine if this is a container element. A container
        element is defined as being a div, body, article or not having any string elements as children.

        Args:
            node: input node
            nodes: input node's children

        Returns:
            True if this is a container element, False otherwise
        """

        return children and (node.name in ("div", "body", "article") or not any(isinstance(x, NavigableString) for x in children))

    def children(self, node):
        """
        Gets the node children, if available.

        Args:
            node: input node

        Returns:
            node children or None if not available
        """

        if node.name and node.contents:
            # Iterate over children and remove whitespace-only string nodes
            return [node for node in node.contents if node.name or node.text.strip()]

        return None

    def articletext(self, node, text):
        """
        Transforms node text using article parsing rules. Article parsing is designed to extract text content from web articles.
        It ignores navigation headers and other superfluous elements.

        Args:
            node: input node
            text: current text

        Returns:
            article text
        """

        # List of valid text nodes
        valid = ("p", "th", "td", "li", "a", "b", "strong", "i", "em")

        # Check if this node is valid or it's part of a table cell
        valid = node.name in valid or (node.parent and node.parent.name in ("th", "td"))

        # Check if text is valid article text
        text = text if (valid or self.isheader(node)) and not self.islink(node) else ""
        if text:
            # Replace non-breaking space plus newline with double newline
            text = text.replace("\xa0\n", "\n\n")

            # Format paragraph whitespace
            if node.name == "p":
                text = f"{text.strip()}\n\n" if self.paragraphs else f"{text.strip()}\n"

        return text

    def isheader(self, node):
        """
        Checks if node is a header node.

        Args:
            node: input node

        Returns:
            True if node is a header node, False otherwise
        """

        return node.name in ("h1", "h2", "h3", "h4", "h5", "h6")

    def islink(self, node):
        """
        Checks if node is a link node. This method does not consider links without tables as link nodes.

        Args:
            node: input node

        Returns:
            True if node is a link node, False otherwise
        """

        # Check if this is a link node or link container
        link, parent = False, node
        while parent:
            if parent.name == "a":
                link = True
                break

            parent = parent.parent

        # Return if this node or any parents are a link. Ignore links in table cells.
        return link and node.parent.name not in ("th", "td")
txtai/pipeline/data/segmentation.py
@@ -0,0 +1,178 @@
"""
Segmentation module
"""

import re

# Conditional import
try:
    from nltk import sent_tokenize

    NLTK = True
except ImportError:
    NLTK = False

# Conditional import
try:
    import chonkie

    CHONKIE = True
except ImportError:
    CHONKIE = False

from ..base import Pipeline


class Segmentation(Pipeline):
    """
    Segments text into logical units.
    """

    def __init__(
        self, sentences=False, lines=False, paragraphs=False, minlength=None, join=False, sections=False, cleantext=True, chunker=None, **kwargs
    ):
        """
        Creates a new Segmentation pipeline.

        Args:
            sentences: tokenize text into sentences if True, defaults to False
            lines: tokenizes text into lines if True, defaults to False
            paragraphs: tokenizes text into paragraphs if True, defaults to False
            minlength: require at least minlength characters per text element, defaults to None
            join: joins tokenized sections back together if True, defaults to False
            sections: tokenizes text into sections if True, defaults to False. Splits using section or page breaks, depending on what's available
            cleantext: apply text cleaning rules, defaults to True
            chunker: creates a third-party chunker to tokenize text if set, defaults to None
            kwargs: additional keyword arguments
        """

        if not NLTK and sentences:
            raise ImportError('NLTK is not available - install "pipeline" extra to enable')

        if not CHONKIE and chunker:
            raise ImportError('Chonkie is not available - install "pipeline" extra to enable')

        self.sentences = sentences
        self.lines = lines
        self.paragraphs = paragraphs
        self.sections = sections
        self.minlength = minlength
        self.join = join
        self.cleantext = cleantext

        # Create a third-party chunker, if applicable
        self.chunker = self.createchunker(chunker, **kwargs) if chunker else None

    def __call__(self, text):
        """
        Segments text into semantic units.

        This method supports text as a string or a list. If the input is a string, the return
        type is text|list. If text is a list, a list of returned, this could be a
        list of text or a list of lists depending on the tokenization strategy.

        Args:
            text: text|list

        Returns:
            segmented text
        """

        # Get inputs
        texts = [text] if not isinstance(text, list) else text

        # Extract text for each input file
        results = []
        for value in texts:
            # Get text
            value = self.text(value)

            # Parse and add extracted results
            results.append(self.parse(value))

        return results[0] if isinstance(text, str) else results

    def text(self, text):
        """
        Hook to allow extracting text out of input text object.

        Args:
            text: object to extract text from
        """

        return text

    def parse(self, text):
        """
        Splits and cleans text based on the current parameters.

        Args:
            text: input text

        Returns:
            parsed and clean content
        """

        content = None

        if self.chunker:
            # pylint: disable=E1102
            content = [self.clean(x.text) for x in self.chunker(text)]
        elif self.sentences:
            content = [self.clean(x) for x in sent_tokenize(text)]
        elif self.lines:
            content = [self.clean(x) for x in re.split(r"\n{1,}", text)]
        elif self.paragraphs:
            content = [self.clean(x) for x in re.split(r"\n{2,}", text)]
        elif self.sections:
            split = r"\f" if "\f" in text else r"\n{3,}"
            content = [self.clean(x) for x in re.split(split, text)]
        else:
            content = self.clean(text)

        # Text tokenization enabled
        if isinstance(content, list):
            # Remove empty strings
            content = [x for x in content if x]
            return " ".join(content) if self.join else content

        # Default method that returns clean text
        return content

    def clean(self, text):
        """
        Applies a series of rules to clean text.

        Args:
            text: input text

        Returns:
            clean text
        """

        # Text cleaning disabled, return original text
        if not self.cleantext:
            return text

        # Collapse and remove excess whitespace
        text = re.sub(r" +", " ", text)
        text = text.strip()

        # If minlength enabled, require at least minlength chars
        return text if not self.minlength or len(text) >= self.minlength else None

    def createchunker(self, chunker, **kwargs):
        """
        Creates a new third-party chunker

        Args:
            chunker: name of chunker to create
            kwargs: additional keyword arguments

        Returns:
            new chunker
        """

        # Resolve and create a third-party chunker
        chunker = f"{chunker[0].upper() + chunker[1:]}Chunker"
        return getattr(chonkie, chunker)(**kwargs)