spark-nlp 6.0.2__py2.py3-none-any.whl → 6.0.3__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of spark-nlp might be problematic.
- {spark_nlp-6.0.2.dist-info → spark_nlp-6.0.3.dist-info}/METADATA +5 -5
- {spark_nlp-6.0.2.dist-info → spark_nlp-6.0.3.dist-info}/RECORD +12 -11
- sparknlp/__init__.py +1 -1
- sparknlp/annotator/embeddings/__init__.py +1 -0
- sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
- sparknlp/internal/__init__.py +8 -0
- sparknlp/partition/partition_properties.py +63 -1
- sparknlp/partition/partition_transformer.py +11 -7
- sparknlp/reader/sparknlp_reader.py +45 -0
- sparknlp/util.py +26 -0
- {spark_nlp-6.0.2.dist-info → spark_nlp-6.0.3.dist-info}/WHEEL +0 -0
- {spark_nlp-6.0.2.dist-info → spark_nlp-6.0.3.dist-info}/top_level.txt +0 -0
{spark_nlp-6.0.2.dist-info → spark_nlp-6.0.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: spark-nlp
-Version: 6.0.2
+Version: 6.0.3
 Summary: John Snow Labs Spark NLP is a natural language processing library built on top of Apache Spark ML. It provides simple, performant & accurate NLP annotations for machine learning pipelines, that scale easily in a distributed environment.
 Home-page: https://github.com/JohnSnowLabs/spark-nlp
 Author: John Snow Labs
@@ -102,7 +102,7 @@ $ java -version
 $ conda create -n sparknlp python=3.7 -y
 $ conda activate sparknlp
 # spark-nlp by default is based on pyspark 3.x
-$ pip install spark-nlp==6.0.2 pyspark==3.3.1
+$ pip install spark-nlp==6.0.3 pyspark==3.3.1
 ```
 
 In Python console or Jupyter `Python3` kernel:
@@ -168,7 +168,7 @@ For a quick example of using pipelines and models take a look at our official [d
 
 ### Apache Spark Support
 
-Spark NLP *6.0.2* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x
+Spark NLP *6.0.3* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x
 
 | Spark NLP | Apache Spark 3.5.x | Apache Spark 3.4.x | Apache Spark 3.3.x | Apache Spark 3.2.x | Apache Spark 3.1.x | Apache Spark 3.0.x | Apache Spark 2.4.x | Apache Spark 2.3.x |
 |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
@@ -198,7 +198,7 @@ Find out more about 4.x `SparkNLP` versions in our official [documentation](http
 
 ### Databricks Support
 
-Spark NLP 6.0.2 has been tested and is compatible with the following runtimes:
+Spark NLP 6.0.3 has been tested and is compatible with the following runtimes:
 
 | **CPU** | **GPU** |
 |--------------------|--------------------|
@@ -215,7 +215,7 @@ We are compatible with older runtimes. For a full list check databricks support
 
 ### EMR Support
 
-Spark NLP 6.0.2 has been tested and is compatible with the following EMR releases:
+Spark NLP 6.0.3 has been tested and is compatible with the following EMR releases:
 
 | **EMR Release** |
 |--------------------|
{spark_nlp-6.0.2.dist-info → spark_nlp-6.0.3.dist-info}/RECORD
CHANGED
@@ -3,13 +3,13 @@ com/johnsnowlabs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
 com/johnsnowlabs/ml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 com/johnsnowlabs/ml/ai/__init__.py,sha256=YQiK2M7U4d8y5irPy_HB8ae0mSpqS9583MH44pnKJXc,295
 com/johnsnowlabs/nlp/__init__.py,sha256=DPIVXtONO5xXyOk-HB0-sNiHAcco17NN13zPS_6Uw8c,294
-sparknlp/__init__.py,sha256=
+sparknlp/__init__.py,sha256=wE5XbgWtMI8X1kifJLQ43sFkaUAyfmkZj-wiBtT3YKU,13814
 sparknlp/annotation.py,sha256=I5zOxG5vV2RfPZfqN9enT1i4mo6oBcn3Lrzs37QiOiA,5635
 sparknlp/annotation_audio.py,sha256=iRV_InSVhgvAwSRe9NTbUH9v6OGvTM-FPCpSAKVu0mE,1917
 sparknlp/annotation_image.py,sha256=xhCe8Ko-77XqWVuuYHFrjKqF6zPd8Z-RY_rmZXNwCXU,2547
 sparknlp/functions.py,sha256=4cVRyBjlF1YttcMNs5z7gf9NPW7q9qzGb5KOf44Phgs,12120
 sparknlp/upload_to_hub.py,sha256=toULNLeXK3MmTpmza9dR8R8od2QJEw1eTgBDM-O9_I0,6018
-sparknlp/util.py,sha256=
+sparknlp/util.py,sha256=2Z499Psal-NuEJ4CHQNgHnAJrS73QQNyCzKPo1MavU8,2279
 sparknlp/annotator/__init__.py,sha256=G746SY8dRM_AOf-gaoSKlh7D-2TKGpqqHhGr4XF-b2A,3534
 sparknlp/annotator/chunk2_doc.py,sha256=IJ3_vQHvzjqono90AZUzZ67QSYjwquuMYbN9_HSOVcg,3141
 sparknlp/annotator/chunker.py,sha256=8nz9B7R_mxKxcfJRfKvz2x_T29W3u4izE9k0wfYPzgE,5174
@@ -102,7 +102,7 @@ sparknlp/annotator/cv/vit_for_image_classification.py,sha256=D2V3pxAd3rBi1817lxV
 sparknlp/annotator/dependency/__init__.py,sha256=eV43oXAGaYl2N1XKIEAAZJLNP8gpHm8VxuXDeDlQzR4,774
 sparknlp/annotator/dependency/dependency_parser.py,sha256=SxyvHPp8Hs1Xnm5X1nLTMi095XoQMtfL8pbys15mYAI,11212
 sparknlp/annotator/dependency/typed_dependency_parser.py,sha256=60vPdYkbFk9MPGegg3m9Uik9cMXpMZd8tBvXG39gNww,12456
-sparknlp/annotator/embeddings/__init__.py,sha256=
+sparknlp/annotator/embeddings/__init__.py,sha256=mp1Nb6xooX6YYyJt9xVpYrSPseuJrEpnNKCpp2QiFWo,2466
 sparknlp/annotator/embeddings/albert_embeddings.py,sha256=6Rd1LIn8oFIpq_ALcJh-RUjPEO7Ht8wsHY6JHSFyMkw,9995
 sparknlp/annotator/embeddings/auto_gguf_embeddings.py,sha256=IlqkPGOH2lmZvxEyDSGX-G90DtTFOe2Rvujfbg5zvlU,20185
 sparknlp/annotator/embeddings/bert_embeddings.py,sha256=HVUjkg56kBcpGZCo-fmPG5uatMDF3swW_lnbpy1SgSI,8463
@@ -114,6 +114,7 @@ sparknlp/annotator/embeddings/deberta_embeddings.py,sha256=_b5nzLb7heFQNN-uT2oBN
 sparknlp/annotator/embeddings/distil_bert_embeddings.py,sha256=4pyMCsbvvXYeTGIMVUir9wCDKR_1f_HKtXZrTDO1Thc,9275
 sparknlp/annotator/embeddings/doc2vec.py,sha256=Xk3MdEkXatX9lRgbFbAdnIDrLgIxzUIGWFBZeo9BTq0,13226
 sparknlp/annotator/embeddings/e5_embeddings.py,sha256=Esuvrq9JlogGaSSzFVVDkOFMwgYwFwr17I62ZiCDm0k,7858
+sparknlp/annotator/embeddings/e5v_embeddings.py,sha256=NFHO2nxDcgVzyKQ6yz1BWyqtjwt9QHwlkKbBXFwhsO8,5951
 sparknlp/annotator/embeddings/elmo_embeddings.py,sha256=KV-KPs0Pq_OpPaHsnqBz2k_S7VdzyFZ4632IeFNKqJ8,9858
 sparknlp/annotator/embeddings/instructor_embeddings.py,sha256=CTKmbuBOx_KBM4JM-Y1U5LyR-6rrnpoBGbgGE_axS1c,8670
 sparknlp/annotator/embeddings/longformer_embeddings.py,sha256=jS4fxB5O0-d9ta9VKv8ai-17n5YHt5rML8QxUw7K4Io,8754
@@ -224,7 +225,7 @@ sparknlp/common/read_as.py,sha256=imxPGwV7jr4Li_acbo0OAHHRGCBbYv-akzEGaBWEfcY,12
 sparknlp/common/recursive_annotator_approach.py,sha256=vqugBw22cE3Ff7PIpRlnYFuOlchgL0nM26D8j-NdpqU,1449
 sparknlp/common/storage.py,sha256=D91H3p8EIjNspjqAYu6ephRpCUtdcAir4_PrAbkIQWE,4842
 sparknlp/common/utils.py,sha256=Yne6yYcwKxhOZC-U4qfYoDhWUP_6BIaAjI5X_P_df1E,1306
-sparknlp/internal/__init__.py,sha256=
+sparknlp/internal/__init__.py,sha256=ALwce14xOPRxfAPFhlINH4BVH0w3Mjp4_VWV4hSxNJ8,40146
 sparknlp/internal/annotator_java_ml.py,sha256=UGPoThG0rGXUOXGSQnDzEDW81Mu1s5RPF29v7DFyE3c,1187
 sparknlp/internal/annotator_transformer.py,sha256=fXmc2IWXGybqZpbEU9obmbdBYPc798y42zvSB4tqV9U,1448
 sparknlp/internal/extended_java_wrapper.py,sha256=hwP0133-hDiDf5sBF-P3MtUsuuDj1PpQbtGZQIRwzfk,2240
@@ -234,8 +235,8 @@ sparknlp/logging/__init__.py,sha256=DoROFF5KLZe4t4Q-OHxqk1nhqbw9NQ-wb64y8icNwgw,
 sparknlp/logging/comet.py,sha256=_ZBi9-hlilCAnd4lvdYMWiq4Vqsppv8kow3k0cf-NG4,15958
 sparknlp/partition/__init__.py,sha256=L0w-yv_HnnvoKlSX5MzI2GKHW3RLLfGyq8bgWYVeKjU,749
 sparknlp/partition/partition.py,sha256=GXEAUvOea04Vc_JK0z112cAKFrJ4AEpjLJ8xlzZt6Kw,8551
-sparknlp/partition/partition_properties.py,sha256=
-sparknlp/partition/partition_transformer.py,sha256=
+sparknlp/partition/partition_properties.py,sha256=xhAMhlsTBg-WS6KWDyVbRPwO7IzpowVVhJNR-ZGhvdo,9520
+sparknlp/partition/partition_transformer.py,sha256=lRR1h-IMlHR8M0VeB50SbU39GHHF5PgMaJ42qOriS6A,6855
 sparknlp/pretrained/__init__.py,sha256=GV-x9UBK8F2_IR6zYatrzFcVJtkSUIMbxqWsxRUePmQ,793
 sparknlp/pretrained/pretrained_pipeline.py,sha256=lquxiaABuA68Rmu7csamJPqBoRJqMUO0oNHsmEZDAIs,5740
 sparknlp/pretrained/resource_downloader.py,sha256=8_-rpvO2LsX_Lq4wMPif2ca3RlJZWEabt8pDm2xymiI,7806
@@ -243,7 +244,7 @@ sparknlp/pretrained/utils.py,sha256=T1MrvW_DaWk_jcOjVLOea0NMFE9w8fe0ZT_5urZ_nEY,
 sparknlp/reader/__init__.py,sha256=-Toj3AIBki-zXPpV8ezFTI2LX1yP_rK2bhpoa8nBkTw,685
 sparknlp/reader/enums.py,sha256=MNGug9oJ1BBLM1Pbske13kAabalDzHa2kucF5xzFpHs,770
 sparknlp/reader/pdf_to_text.py,sha256=pI1BBQ44tXn8GIMv--_kZJ3bPP8R9Q1lYejkfhi5pMQ,5739
-sparknlp/reader/sparknlp_reader.py,sha256=
+sparknlp/reader/sparknlp_reader.py,sha256=ybnMlwJaBOVbjDw7ng39jcrshlQzexwq98_PTwVeM8g,16779
 sparknlp/training/__init__.py,sha256=qREi9u-5Vc2VjpL6-XZsyvu5jSEIdIhowW7_kKaqMqo,852
 sparknlp/training/conll.py,sha256=wKBiSTrjc6mjsl7Nyt6B8f4yXsDJkZb-sn8iOjix9cE,6961
 sparknlp/training/conllu.py,sha256=8r3i-tmyrLsyk1DtZ9uo2mMDCWb1yw2Y5W6UsV13MkY,4953
@@ -274,7 +275,7 @@ sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py,sha256=R4yHFN3
 sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py,sha256=EoCSdcIjqQ3wv13MAuuWrKV8wyVBP0SbOEW41omHlR0,23189
 sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py,sha256=k5CQ7gKV6HZbZMB8cKLUJuZxoZWlP_DFWdZ--aIDwsc,2356
 sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py,sha256=pAxjWhjazSX8Vg0MFqJiuRVw1IbnQNSs-8Xp26L4nko,870
-spark_nlp-6.0.2.dist-info/METADATA,sha256=
-spark_nlp-6.0.2.dist-info/WHEEL,sha256=
-spark_nlp-6.0.2.dist-info/top_level.txt,sha256=
-spark_nlp-6.0.2.dist-info/RECORD,,
+spark_nlp-6.0.3.dist-info/METADATA,sha256=qMqGlXdyZgzm8D3KkC03Jl73y7S_cAh24necRw1G_Qc,19722
+spark_nlp-6.0.3.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
+spark_nlp-6.0.3.dist-info/top_level.txt,sha256=uuytur4pyMRw2H_txNY2ZkaucZHUs22QF8-R03ch_-E,13
+spark_nlp-6.0.3.dist-info/RECORD,,
sparknlp/annotator/embeddings/__init__.py
CHANGED
@@ -41,3 +41,4 @@ from sparknlp.annotator.embeddings.mxbai_embeddings import *
 from sparknlp.annotator.embeddings.snowflake_embeddings import *
 from sparknlp.annotator.embeddings.nomic_embeddings import *
 from sparknlp.annotator.embeddings.auto_gguf_embeddings import *
+from sparknlp.annotator.embeddings.e5v_embeddings import *
sparknlp/annotator/embeddings/e5v_embeddings.py
ADDED
@@ -0,0 +1,138 @@
+# Copyright 2017-2024 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from sparknlp.common import *
+
+class E5VEmbeddings(AnnotatorModel,
+                    HasBatchedAnnotateImage,
+                    HasImageFeatureProperties,
+                    HasEngine,
+                    HasRescaleFactor):
+    """Universal multimodal embeddings using the E5-V model (see https://huggingface.co/royokong/e5-v).
+
+    E5-V bridges the modality gap between different input types (text, image) and demonstrates strong performance in multimodal embeddings, even without fine-tuning. It also supports a single-modality training approach, where the model is trained exclusively on text pairs, often yielding better performance than multimodal training.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion object:
+
+    >>> e5vEmbeddings = E5VEmbeddings.pretrained() \
+    ...     .setInputCols(["image_assembler"]) \
+    ...     .setOutputCol("e5v")
+
+    The default model is ``"e5v_int4"``, if no name is provided.
+
+    For available pretrained models please see the `Models Hub <https://sparknlp.org/models?task=Question+Answering>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``IMAGE``              ``SENTENCE_EMBEDDINGS``
+    ====================== ======================
+
+    Examples
+    --------
+    Image + Text Embedding:
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> image_df = spark.read.format("image").option("dropInvalid", value = True).load(imageFolder)
+    >>> imagePrompt = "<|start_header_id|>user<|end_header_id|>\n\n<image>\\nSummary above image in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"
+    >>> test_df = image_df.withColumn("text", lit(imagePrompt))
+    >>> imageAssembler = ImageAssembler() \
+    ...     .setInputCol("image") \
+    ...     .setOutputCol("image_assembler")
+    >>> e5vEmbeddings = E5VEmbeddings.pretrained() \
+    ...     .setInputCols(["image_assembler"]) \
+    ...     .setOutputCol("e5v")
+    >>> pipeline = Pipeline().setStages([
+    ...     imageAssembler,
+    ...     e5vEmbeddings
+    ... ])
+    >>> result = pipeline.fit(test_df).transform(test_df)
+    >>> result.select("e5v.embeddings").show(truncate = False)
+
+    Text-Only Embedding:
+    >>> from sparknlp.util import EmbeddingsDataFrameUtils
+    >>> textPrompt = "<|start_header_id|>user<|end_header_id|>\n\n<sent>\\nSummary above sentence in one word: <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"
+    >>> textDesc = "A cat sitting in a box."
+    >>> nullImageDF = spark.createDataFrame(spark.sparkContext.parallelize([EmbeddingsDataFrameUtils.emptyImageRow]), EmbeddingsDataFrameUtils.imageSchema)
+    >>> textDF = nullImageDF.withColumn("text", lit(textPrompt.replace("<sent>", textDesc)))
+    >>> e5vEmbeddings = E5VEmbeddings.pretrained() \
+    ...     .setInputCols(["image"]) \
+    ...     .setOutputCol("e5v")
+    >>> result = e5vEmbeddings.transform(textDF)
+    >>> result.select("e5v.embeddings").show(truncate = False)
+    """
+
+    name = "E5VEmbeddings"
+
+    inputAnnotatorTypes = [AnnotatorType.IMAGE]
+    outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.E5VEmbeddings", java_model=None):
+        """Initializes the E5VEmbeddings annotator.
+
+        Parameters
+        ----------
+        classname : str, optional
+            The Java class name of the annotator, by default "com.johnsnowlabs.nlp.annotators.embeddings.E5VEmbeddings"
+        java_model : Optional[java.lang.Object], optional
+            A pre-initialized Java model, by default None
+        """
+        super(E5VEmbeddings, self).__init__(classname=classname, java_model=java_model)
+        self._setDefault()
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session, use_openvino=False):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+        use_openvino : bool, optional
+            Whether to use OpenVINO engine, by default False
+
+        Returns
+        -------
+        E5VEmbeddings
+            The restored model
+        """
+        from sparknlp.internal import _E5VEmbeddingsLoader
+        jModel = _E5VEmbeddingsLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
+        return E5VEmbeddings(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="e5v_int4", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "e5v_int4"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use Spark NLPs repositories otherwise.
+
+        Returns
+        -------
+        E5VEmbeddings
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(E5VEmbeddings, name, lang, remote_loc)
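Stitched together from the docstring above, here is a minimal runnable sketch of the image-embedding path. The docstring snippets assume `spark`, `imageFolder`, and `lit` are already in scope; in this sketch `imageFolder` is a placeholder and `lit` is imported from `pyspark.sql.functions`:

```python
# Minimal sketch of the image-embedding flow from the E5VEmbeddings docstring.
# Assumptions: `imageFolder` is a placeholder path; `lit` comes from
# pyspark.sql.functions (the docstring snippets omit this import).
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from pyspark.sql.functions import lit

spark = sparknlp.start()
imageFolder = "path/to/images"  # placeholder

image_df = spark.read.format("image").option("dropInvalid", True).load(imageFolder)
imagePrompt = (
    "<|start_header_id|>user<|end_header_id|>\n\n<image>\\n"
    "Summary above image in one word: <|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n \n"
)
test_df = image_df.withColumn("text", lit(imagePrompt))

imageAssembler = ImageAssembler().setInputCol("image").setOutputCol("image_assembler")
e5v = E5VEmbeddings.pretrained().setInputCols(["image_assembler"]).setOutputCol("e5v")

pipeline = Pipeline().setStages([imageAssembler, e5v])
pipeline.fit(test_df).transform(test_df).select("e5v.embeddings").show(truncate=False)
```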
sparknlp/internal/__init__.py
CHANGED
@@ -1165,3 +1165,11 @@ class _Florence2TransformerLoader(ExtendedJavaWrapper):
             jspark,
             use_openvino,
         )
+class _E5VEmbeddingsLoader(ExtendedJavaWrapper):
+    def __init__(self, path, jspark, use_openvino=False):
+        super(_E5VEmbeddingsLoader, self).__init__(
+            "com.johnsnowlabs.nlp.embeddings.E5VEmbeddings.loadSavedModel",
+            path,
+            jspark,
+            use_openvino
+        )
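The loader above is reached through `E5VEmbeddings.loadSavedModel`. A hedged sketch, assuming a model has already been exported to a local folder in the layout `loadSavedModel` expects (the path is hypothetical):

```python
import sparknlp
from sparknlp.annotator import E5VEmbeddings

spark = sparknlp.start()

# Hypothetical local export; loadSavedModel wraps _E5VEmbeddingsLoader and
# returns a ready-to-configure E5VEmbeddings instance.
e5v = (
    E5VEmbeddings.loadSavedModel("/tmp/e5v_spark_nlp", spark, use_openvino=False)
    .setInputCols(["image_assembler"])
    .setOutputCol("e5v")
)
```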
sparknlp/partition/partition_properties.py
CHANGED
@@ -254,4 +254,66 @@ class HasTextReaderProperties(Params):
         return self._set(threshold=value)
 
     def getThreshold(self):
-        return self.getOrDefault(self.threshold)
+        return self.getOrDefault(self.threshold)
+
+class HasChunkerProperties(Params):
+
+    chunkingStrategy = Param(
+        Params._dummy(),
+        "chunkingStrategy",
+        "Set the chunking strategy",
+        typeConverter=TypeConverters.toString
+    )
+
+    def setChunkingStrategy(self, value):
+        return self._set(chunkingStrategy=value)
+
+    maxCharacters = Param(
+        Params._dummy(),
+        "maxCharacters",
+        "Set the maximum number of characters",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setMaxCharacters(self, value):
+        return self._set(maxCharacters=value)
+
+    newAfterNChars = Param(
+        Params._dummy(),
+        "newAfterNChars",
+        "Insert a new chunk after N characters",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setNewAfterNChars(self, value):
+        return self._set(newAfterNChars=value)
+
+    overlap = Param(
+        Params._dummy(),
+        "overlap",
+        "Set the number of overlapping characters between chunks",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setOverlap(self, value):
+        return self._set(overlap=value)
+
+    combineTextUnderNChars = Param(
+        Params._dummy(),
+        "combineTextUnderNChars",
+        "Threshold to merge adjacent small sections",
+        typeConverter=TypeConverters.toInt
+    )
+
+    def setCombineTextUnderNChars(self, value):
+        return self._set(combineTextUnderNChars=value)
+
+    overlapAll = Param(
+        Params._dummy(),
+        "overlapAll",
+        "Apply overlap context between all sections, not just split chunks",
+        typeConverter=TypeConverters.toBoolean
+    )
+
+    def setOverlapAll(self, value):
+        return self._set(overlapAll=value)
sparknlp/partition/partition_transformer.py
CHANGED
@@ -15,13 +15,15 @@
 from sparknlp.common import *
 from sparknlp.partition.partition_properties import *
 
+
 class PartitionTransformer(
     AnnotatorModel,
     HasEmailReaderProperties,
     HasExcelReaderProperties,
     HasHTMLReaderProperties,
     HasPowerPointProperties,
-    HasTextReaderProperties
+    HasTextReaderProperties,
+    HasChunkerProperties
 ):
     """
     The PartitionTransformer annotator allows you to use the Partition feature more smoothly
@@ -162,10 +164,6 @@ class PartitionTransformer(
     def getIncludePageBreaks(self):
         return self.getOrDefault(self.includePageBreaks)
 
-    # def setHeaders(self, headers: Dict[str, str]):
-    #     self._call_java("setHeadersPython", headers)
-    #     return self
-
     @keyword_only
     def __init__(self, classname="com.johnsnowlabs.partition.PartitionTransformer",
                  java_model=None):
@@ -192,5 +190,11 @@ class PartitionTransformer(
             paragraphSplit=DOUBLE_PARAGRAPH_PATTERN,
             shortLineWordThreshold=5,
             maxLineCount=2000,
-            threshold=0.1
-        )
+            threshold=0.1,
+            chunkingStrategy="",
+            maxCharacters=100,
+            newAfterNChars=-1,
+            overlap=0,
+            combineTextUnderNChars=0,
+            overlapAll=False
+        )
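Taken together, the two partition changes let chunking be configured directly on `PartitionTransformer` through the `HasChunkerProperties` setters, with the defaults shown in `_setDefault` above. A sketch under assumptions: the column wiring and the `"basic"` strategy value are illustrative, not taken from this diff.

```python
from sparknlp.partition.partition_transformer import PartitionTransformer

partition = (
    PartitionTransformer()
    .setInputCols(["document"])      # assumed column wiring, not from the diff
    .setOutputCol("chunks")          # assumed column wiring, not from the diff
    .setChunkingStrategy("basic")    # illustrative value; default "" leaves chunking off
    .setMaxCharacters(512)           # default 100
    .setNewAfterNChars(400)          # default -1 (disabled)
    .setOverlap(50)                  # default 0 overlapping characters
    .setCombineTextUnderNChars(20)   # default 0; merges adjacent small sections
    .setOverlapAll(False)            # default False; overlap only split chunks
)
```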
sparknlp/reader/sparknlp_reader.py
CHANGED
@@ -322,4 +322,49 @@ class SparkNLPReader(ExtendedJavaWrapper):
         if not isinstance(docPath, str):
             raise TypeError("docPath must be a string")
         jdf = self._java_obj.txt(docPath)
+        return self.getDataFrame(self.spark, jdf)
+
+    def xml(self, docPath):
+        """Reads XML files and returns a Spark DataFrame.
+
+        Parameters
+        ----------
+        docPath : str
+            Path to an XML file or a directory containing XML files.
+
+        Returns
+        -------
+        pyspark.sql.DataFrame
+            A DataFrame containing parsed XML content.
+
+        Examples
+        --------
+        >>> from sparknlp.reader import SparkNLPReader
+        >>> xml_df = SparkNLPReader(spark).xml("home/user/xml-directory")
+
+        You can use SparkNLP for one line of code
+
+        >>> import sparknlp
+        >>> xml_df = sparknlp.read().xml("home/user/xml-directory")
+        >>> xml_df.show(truncate=False)
+        +-----------------------------------------------------------+
+        |xml                                                        |
+        +-----------------------------------------------------------+
+        |[{Title, John Smith, {elementId -> ..., tag -> title}}]    |
+        +-----------------------------------------------------------+
+
+        >>> xml_df.printSchema()
+        root
+         |-- path: string (nullable = true)
+         |-- xml: array (nullable = true)
+         |    |-- element: struct (containsNull = true)
+         |    |    |-- elementType: string (nullable = true)
+         |    |    |-- content: string (nullable = true)
+         |    |    |-- metadata: map (nullable = true)
+         |    |    |    |-- key: string
+         |    |    |    |-- value: string (valueContainsNull = true)
+        """
+        if not isinstance(docPath, str):
+            raise TypeError("docPath must be a string")
+        jdf = self._java_obj.xml(docPath)
         return self.getDataFrame(self.spark, jdf)
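As a follow-on to the docstring's schema, a small sketch that flattens the `xml` column into one row per element (the directory path is a placeholder):

```python
import sparknlp
from pyspark.sql.functions import col, explode

spark = sparknlp.start()
xml_df = sparknlp.read().xml("home/user/xml-directory")  # placeholder path

# One row per parsed XML element, following the printSchema layout above:
# elementType, content, and a string->string metadata map (e.g., the "tag" key).
elements = (
    xml_df
    .select(col("path"), explode(col("xml")).alias("element"))
    .select(
        "path",
        col("element.elementType"),
        col("element.content"),
        col("element.metadata")["tag"].alias("tag"),
    )
)
elements.show(truncate=False)
```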
sparknlp/util.py
CHANGED
@@ -15,6 +15,9 @@
 
 
 import sparknlp.internal as _internal
+import numpy as np
+from pyspark.sql import Row
+from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BinaryType
 
 
 def get_config_path():
@@ -33,3 +36,26 @@ class CoNLLGenerator:
             _internal._CoNLLGeneratorExportFromTargetAndPipeline(*args).apply()
         else:
             raise NotImplementedError(f"No exportConllFiles alternative takes {num_args} parameters")
+
+
+class EmbeddingsDataFrameUtils:
+    """
+    Utility for creating DataFrames compatible with multimodal embedding models (e.g., E5VEmbeddings) for text-only scenarios.
+    Provides:
+      - imageSchema: the expected schema for Spark image DataFrames
+      - emptyImageRow: a dummy image row for text-only embedding
+    """
+    imageSchema = StructType([
+        StructField(
+            "image",
+            StructType([
+                StructField("origin", StringType(), True),
+                StructField("height", IntegerType(), True),
+                StructField("width", IntegerType(), True),
+                StructField("nChannels", IntegerType(), True),
+                StructField("mode", IntegerType(), True),
+                StructField("data", BinaryType(), True),
+            ]),
+        )
+    ])
+    emptyImageRow = Row(Row("", 0, 0, 0, 0, bytes()))
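The new utility pairs with `E5VEmbeddings` for text-only inputs, as in its docstring: a dummy image row satisfies the `IMAGE` input contract while the `text` column carries the actual content. A minimal sketch mirroring that example:

```python
import sparknlp
from sparknlp.annotator import E5VEmbeddings
from sparknlp.util import EmbeddingsDataFrameUtils
from pyspark.sql.functions import lit

spark = sparknlp.start()

textPrompt = (
    "<|start_header_id|>user<|end_header_id|>\n\n<sent>\\n"
    "Summary above sentence in one word: <|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n \n"
)
textDesc = "A cat sitting in a box."

# A single dummy image row with the expected Spark image schema.
nullImageDF = spark.createDataFrame(
    spark.sparkContext.parallelize([EmbeddingsDataFrameUtils.emptyImageRow]),
    EmbeddingsDataFrameUtils.imageSchema,
)
textDF = nullImageDF.withColumn("text", lit(textPrompt.replace("<sent>", textDesc)))

e5v = E5VEmbeddings.pretrained().setInputCols(["image"]).setOutputCol("e5v")
e5v.transform(textDF).select("e5v.embeddings").show(truncate=False)
```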
{spark_nlp-6.0.2.dist-info → spark_nlp-6.0.3.dist-info}/WHEEL
File without changes
{spark_nlp-6.0.2.dist-info → spark_nlp-6.0.3.dist-info}/top_level.txt
File without changes