spark-nlp 2.6.3rc1__py2.py3-none-any.whl → 6.2.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (329)
  1. com/johnsnowlabs/ml/__init__.py +0 -0
  2. com/johnsnowlabs/ml/ai/__init__.py +10 -0
  3. com/johnsnowlabs/nlp/__init__.py +4 -2
  4. spark_nlp-6.2.1.dist-info/METADATA +362 -0
  5. spark_nlp-6.2.1.dist-info/RECORD +292 -0
  6. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/WHEEL +1 -1
  7. sparknlp/__init__.py +281 -27
  8. sparknlp/annotation.py +137 -6
  9. sparknlp/annotation_audio.py +61 -0
  10. sparknlp/annotation_image.py +82 -0
  11. sparknlp/annotator/__init__.py +93 -0
  12. sparknlp/annotator/audio/__init__.py +16 -0
  13. sparknlp/annotator/audio/hubert_for_ctc.py +188 -0
  14. sparknlp/annotator/audio/wav2vec2_for_ctc.py +161 -0
  15. sparknlp/annotator/audio/whisper_for_ctc.py +251 -0
  16. sparknlp/annotator/chunk2_doc.py +85 -0
  17. sparknlp/annotator/chunker.py +137 -0
  18. sparknlp/annotator/classifier_dl/__init__.py +61 -0
  19. sparknlp/annotator/classifier_dl/albert_for_multiple_choice.py +161 -0
  20. sparknlp/annotator/classifier_dl/albert_for_question_answering.py +172 -0
  21. sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +201 -0
  22. sparknlp/annotator/classifier_dl/albert_for_token_classification.py +179 -0
  23. sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py +211 -0
  24. sparknlp/annotator/classifier_dl/bart_for_zero_shot_classification.py +225 -0
  25. sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py +161 -0
  26. sparknlp/annotator/classifier_dl/bert_for_question_answering.py +168 -0
  27. sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +202 -0
  28. sparknlp/annotator/classifier_dl/bert_for_token_classification.py +177 -0
  29. sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +212 -0
  30. sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +168 -0
  31. sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +205 -0
  32. sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +173 -0
  33. sparknlp/annotator/classifier_dl/camembert_for_zero_shot_classification.py +202 -0
  34. sparknlp/annotator/classifier_dl/classifier_dl.py +320 -0
  35. sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +168 -0
  36. sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +198 -0
  37. sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +175 -0
  38. sparknlp/annotator/classifier_dl/deberta_for_zero_shot_classification.py +193 -0
  39. sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +168 -0
  40. sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +201 -0
  41. sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +175 -0
  42. sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +211 -0
  43. sparknlp/annotator/classifier_dl/distilbert_for_multiple_choice.py +161 -0
  44. sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +168 -0
  45. sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +201 -0
  46. sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +176 -0
  47. sparknlp/annotator/classifier_dl/mpnet_for_question_answering.py +148 -0
  48. sparknlp/annotator/classifier_dl/mpnet_for_sequence_classification.py +188 -0
  49. sparknlp/annotator/classifier_dl/mpnet_for_token_classification.py +173 -0
  50. sparknlp/annotator/classifier_dl/multi_classifier_dl.py +395 -0
  51. sparknlp/annotator/classifier_dl/roberta_for_multiple_choice.py +161 -0
  52. sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +168 -0
  53. sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +201 -0
  54. sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +189 -0
  55. sparknlp/annotator/classifier_dl/roberta_for_zero_shot_classification.py +225 -0
  56. sparknlp/annotator/classifier_dl/sentiment_dl.py +378 -0
  57. sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +170 -0
  58. sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py +149 -0
  59. sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +168 -0
  60. sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +201 -0
  61. sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +173 -0
  62. sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py +225 -0
  63. sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +201 -0
  64. sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +176 -0
  65. sparknlp/annotator/cleaners/__init__.py +15 -0
  66. sparknlp/annotator/cleaners/cleaner.py +202 -0
  67. sparknlp/annotator/cleaners/extractor.py +191 -0
  68. sparknlp/annotator/coref/__init__.py +1 -0
  69. sparknlp/annotator/coref/spanbert_coref.py +221 -0
  70. sparknlp/annotator/cv/__init__.py +29 -0
  71. sparknlp/annotator/cv/blip_for_question_answering.py +172 -0
  72. sparknlp/annotator/cv/clip_for_zero_shot_classification.py +193 -0
  73. sparknlp/annotator/cv/convnext_for_image_classification.py +269 -0
  74. sparknlp/annotator/cv/florence2_transformer.py +180 -0
  75. sparknlp/annotator/cv/gemma3_for_multimodal.py +346 -0
  76. sparknlp/annotator/cv/internvl_for_multimodal.py +280 -0
  77. sparknlp/annotator/cv/janus_for_multimodal.py +351 -0
  78. sparknlp/annotator/cv/llava_for_multimodal.py +328 -0
  79. sparknlp/annotator/cv/mllama_for_multimodal.py +340 -0
  80. sparknlp/annotator/cv/paligemma_for_multimodal.py +308 -0
  81. sparknlp/annotator/cv/phi3_vision_for_multimodal.py +328 -0
  82. sparknlp/annotator/cv/qwen2vl_transformer.py +332 -0
  83. sparknlp/annotator/cv/smolvlm_transformer.py +426 -0
  84. sparknlp/annotator/cv/swin_for_image_classification.py +242 -0
  85. sparknlp/annotator/cv/vision_encoder_decoder_for_image_captioning.py +240 -0
  86. sparknlp/annotator/cv/vit_for_image_classification.py +217 -0
  87. sparknlp/annotator/dataframe_optimizer.py +216 -0
  88. sparknlp/annotator/date2_chunk.py +88 -0
  89. sparknlp/annotator/dependency/__init__.py +17 -0
  90. sparknlp/annotator/dependency/dependency_parser.py +294 -0
  91. sparknlp/annotator/dependency/typed_dependency_parser.py +318 -0
  92. sparknlp/annotator/document_character_text_splitter.py +228 -0
  93. sparknlp/annotator/document_normalizer.py +235 -0
  94. sparknlp/annotator/document_token_splitter.py +175 -0
  95. sparknlp/annotator/document_token_splitter_test.py +85 -0
  96. sparknlp/annotator/embeddings/__init__.py +45 -0
  97. sparknlp/annotator/embeddings/albert_embeddings.py +230 -0
  98. sparknlp/annotator/embeddings/auto_gguf_embeddings.py +539 -0
  99. sparknlp/annotator/embeddings/bert_embeddings.py +208 -0
  100. sparknlp/annotator/embeddings/bert_sentence_embeddings.py +224 -0
  101. sparknlp/annotator/embeddings/bge_embeddings.py +199 -0
  102. sparknlp/annotator/embeddings/camembert_embeddings.py +210 -0
  103. sparknlp/annotator/embeddings/chunk_embeddings.py +149 -0
  104. sparknlp/annotator/embeddings/deberta_embeddings.py +208 -0
  105. sparknlp/annotator/embeddings/distil_bert_embeddings.py +221 -0
  106. sparknlp/annotator/embeddings/doc2vec.py +352 -0
  107. sparknlp/annotator/embeddings/e5_embeddings.py +195 -0
  108. sparknlp/annotator/embeddings/e5v_embeddings.py +138 -0
  109. sparknlp/annotator/embeddings/elmo_embeddings.py +251 -0
  110. sparknlp/annotator/embeddings/instructor_embeddings.py +204 -0
  111. sparknlp/annotator/embeddings/longformer_embeddings.py +211 -0
  112. sparknlp/annotator/embeddings/minilm_embeddings.py +189 -0
  113. sparknlp/annotator/embeddings/mpnet_embeddings.py +192 -0
  114. sparknlp/annotator/embeddings/mxbai_embeddings.py +184 -0
  115. sparknlp/annotator/embeddings/nomic_embeddings.py +181 -0
  116. sparknlp/annotator/embeddings/roberta_embeddings.py +225 -0
  117. sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +191 -0
  118. sparknlp/annotator/embeddings/sentence_embeddings.py +134 -0
  119. sparknlp/annotator/embeddings/snowflake_embeddings.py +202 -0
  120. sparknlp/annotator/embeddings/uae_embeddings.py +211 -0
  121. sparknlp/annotator/embeddings/universal_sentence_encoder.py +211 -0
  122. sparknlp/annotator/embeddings/word2vec.py +353 -0
  123. sparknlp/annotator/embeddings/word_embeddings.py +385 -0
  124. sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +225 -0
  125. sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +194 -0
  126. sparknlp/annotator/embeddings/xlnet_embeddings.py +227 -0
  127. sparknlp/annotator/er/__init__.py +16 -0
  128. sparknlp/annotator/er/entity_ruler.py +267 -0
  129. sparknlp/annotator/graph_extraction.py +368 -0
  130. sparknlp/annotator/keyword_extraction/__init__.py +16 -0
  131. sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py +270 -0
  132. sparknlp/annotator/ld_dl/__init__.py +16 -0
  133. sparknlp/annotator/ld_dl/language_detector_dl.py +199 -0
  134. sparknlp/annotator/lemmatizer.py +250 -0
  135. sparknlp/annotator/matcher/__init__.py +20 -0
  136. sparknlp/annotator/matcher/big_text_matcher.py +272 -0
  137. sparknlp/annotator/matcher/date_matcher.py +303 -0
  138. sparknlp/annotator/matcher/multi_date_matcher.py +109 -0
  139. sparknlp/annotator/matcher/regex_matcher.py +221 -0
  140. sparknlp/annotator/matcher/text_matcher.py +290 -0
  141. sparknlp/annotator/n_gram_generator.py +141 -0
  142. sparknlp/annotator/ner/__init__.py +21 -0
  143. sparknlp/annotator/ner/ner_approach.py +94 -0
  144. sparknlp/annotator/ner/ner_converter.py +148 -0
  145. sparknlp/annotator/ner/ner_crf.py +397 -0
  146. sparknlp/annotator/ner/ner_dl.py +591 -0
  147. sparknlp/annotator/ner/ner_dl_graph_checker.py +293 -0
  148. sparknlp/annotator/ner/ner_overwriter.py +166 -0
  149. sparknlp/annotator/ner/zero_shot_ner_model.py +173 -0
  150. sparknlp/annotator/normalizer.py +230 -0
  151. sparknlp/annotator/openai/__init__.py +16 -0
  152. sparknlp/annotator/openai/openai_completion.py +349 -0
  153. sparknlp/annotator/openai/openai_embeddings.py +106 -0
  154. sparknlp/annotator/param/__init__.py +17 -0
  155. sparknlp/annotator/param/classifier_encoder.py +98 -0
  156. sparknlp/annotator/param/evaluation_dl_params.py +130 -0
  157. sparknlp/annotator/pos/__init__.py +16 -0
  158. sparknlp/annotator/pos/perceptron.py +263 -0
  159. sparknlp/annotator/sentence/__init__.py +17 -0
  160. sparknlp/annotator/sentence/sentence_detector.py +290 -0
  161. sparknlp/annotator/sentence/sentence_detector_dl.py +467 -0
  162. sparknlp/annotator/sentiment/__init__.py +17 -0
  163. sparknlp/annotator/sentiment/sentiment_detector.py +208 -0
  164. sparknlp/annotator/sentiment/vivekn_sentiment.py +242 -0
  165. sparknlp/annotator/seq2seq/__init__.py +35 -0
  166. sparknlp/annotator/seq2seq/auto_gguf_model.py +304 -0
  167. sparknlp/annotator/seq2seq/auto_gguf_reranker.py +334 -0
  168. sparknlp/annotator/seq2seq/auto_gguf_vision_model.py +336 -0
  169. sparknlp/annotator/seq2seq/bart_transformer.py +420 -0
  170. sparknlp/annotator/seq2seq/cohere_transformer.py +357 -0
  171. sparknlp/annotator/seq2seq/cpm_transformer.py +321 -0
  172. sparknlp/annotator/seq2seq/gpt2_transformer.py +363 -0
  173. sparknlp/annotator/seq2seq/llama2_transformer.py +343 -0
  174. sparknlp/annotator/seq2seq/llama3_transformer.py +381 -0
  175. sparknlp/annotator/seq2seq/m2m100_transformer.py +392 -0
  176. sparknlp/annotator/seq2seq/marian_transformer.py +374 -0
  177. sparknlp/annotator/seq2seq/mistral_transformer.py +348 -0
  178. sparknlp/annotator/seq2seq/nllb_transformer.py +420 -0
  179. sparknlp/annotator/seq2seq/olmo_transformer.py +326 -0
  180. sparknlp/annotator/seq2seq/phi2_transformer.py +326 -0
  181. sparknlp/annotator/seq2seq/phi3_transformer.py +330 -0
  182. sparknlp/annotator/seq2seq/phi4_transformer.py +387 -0
  183. sparknlp/annotator/seq2seq/qwen_transformer.py +340 -0
  184. sparknlp/annotator/seq2seq/starcoder_transformer.py +335 -0
  185. sparknlp/annotator/seq2seq/t5_transformer.py +425 -0
  186. sparknlp/annotator/similarity/__init__.py +0 -0
  187. sparknlp/annotator/similarity/document_similarity_ranker.py +379 -0
  188. sparknlp/annotator/spell_check/__init__.py +18 -0
  189. sparknlp/annotator/spell_check/context_spell_checker.py +911 -0
  190. sparknlp/annotator/spell_check/norvig_sweeting.py +358 -0
  191. sparknlp/annotator/spell_check/symmetric_delete.py +299 -0
  192. sparknlp/annotator/stemmer.py +79 -0
  193. sparknlp/annotator/stop_words_cleaner.py +190 -0
  194. sparknlp/annotator/tf_ner_dl_graph_builder.py +179 -0
  195. sparknlp/annotator/token/__init__.py +19 -0
  196. sparknlp/annotator/token/chunk_tokenizer.py +118 -0
  197. sparknlp/annotator/token/recursive_tokenizer.py +205 -0
  198. sparknlp/annotator/token/regex_tokenizer.py +208 -0
  199. sparknlp/annotator/token/tokenizer.py +561 -0
  200. sparknlp/annotator/token2_chunk.py +76 -0
  201. sparknlp/annotator/ws/__init__.py +16 -0
  202. sparknlp/annotator/ws/word_segmenter.py +429 -0
  203. sparknlp/base/__init__.py +30 -0
  204. sparknlp/base/audio_assembler.py +95 -0
  205. sparknlp/base/doc2_chunk.py +169 -0
  206. sparknlp/base/document_assembler.py +164 -0
  207. sparknlp/base/embeddings_finisher.py +201 -0
  208. sparknlp/base/finisher.py +217 -0
  209. sparknlp/base/gguf_ranking_finisher.py +234 -0
  210. sparknlp/base/graph_finisher.py +125 -0
  211. sparknlp/base/has_recursive_fit.py +24 -0
  212. sparknlp/base/has_recursive_transform.py +22 -0
  213. sparknlp/base/image_assembler.py +172 -0
  214. sparknlp/base/light_pipeline.py +429 -0
  215. sparknlp/base/multi_document_assembler.py +164 -0
  216. sparknlp/base/prompt_assembler.py +207 -0
  217. sparknlp/base/recursive_pipeline.py +107 -0
  218. sparknlp/base/table_assembler.py +145 -0
  219. sparknlp/base/token_assembler.py +124 -0
  220. sparknlp/common/__init__.py +26 -0
  221. sparknlp/common/annotator_approach.py +41 -0
  222. sparknlp/common/annotator_model.py +47 -0
  223. sparknlp/common/annotator_properties.py +114 -0
  224. sparknlp/common/annotator_type.py +38 -0
  225. sparknlp/common/completion_post_processing.py +37 -0
  226. sparknlp/common/coverage_result.py +22 -0
  227. sparknlp/common/match_strategy.py +33 -0
  228. sparknlp/common/properties.py +1298 -0
  229. sparknlp/common/read_as.py +33 -0
  230. sparknlp/common/recursive_annotator_approach.py +35 -0
  231. sparknlp/common/storage.py +149 -0
  232. sparknlp/common/utils.py +39 -0
  233. sparknlp/functions.py +315 -5
  234. sparknlp/internal/__init__.py +1199 -0
  235. sparknlp/internal/annotator_java_ml.py +32 -0
  236. sparknlp/internal/annotator_transformer.py +37 -0
  237. sparknlp/internal/extended_java_wrapper.py +63 -0
  238. sparknlp/internal/params_getters_setters.py +71 -0
  239. sparknlp/internal/recursive.py +70 -0
  240. sparknlp/logging/__init__.py +15 -0
  241. sparknlp/logging/comet.py +467 -0
  242. sparknlp/partition/__init__.py +16 -0
  243. sparknlp/partition/partition.py +244 -0
  244. sparknlp/partition/partition_properties.py +902 -0
  245. sparknlp/partition/partition_transformer.py +200 -0
  246. sparknlp/pretrained/__init__.py +17 -0
  247. sparknlp/pretrained/pretrained_pipeline.py +158 -0
  248. sparknlp/pretrained/resource_downloader.py +216 -0
  249. sparknlp/pretrained/utils.py +35 -0
  250. sparknlp/reader/__init__.py +15 -0
  251. sparknlp/reader/enums.py +19 -0
  252. sparknlp/reader/pdf_to_text.py +190 -0
  253. sparknlp/reader/reader2doc.py +124 -0
  254. sparknlp/reader/reader2image.py +136 -0
  255. sparknlp/reader/reader2table.py +44 -0
  256. sparknlp/reader/reader_assembler.py +159 -0
  257. sparknlp/reader/sparknlp_reader.py +461 -0
  258. sparknlp/training/__init__.py +20 -0
  259. sparknlp/training/_tf_graph_builders/__init__.py +0 -0
  260. sparknlp/training/_tf_graph_builders/graph_builders.py +299 -0
  261. sparknlp/training/_tf_graph_builders/ner_dl/__init__.py +0 -0
  262. sparknlp/training/_tf_graph_builders/ner_dl/create_graph.py +41 -0
  263. sparknlp/training/_tf_graph_builders/ner_dl/dataset_encoder.py +78 -0
  264. sparknlp/training/_tf_graph_builders/ner_dl/ner_model.py +521 -0
  265. sparknlp/training/_tf_graph_builders/ner_dl/ner_model_saver.py +62 -0
  266. sparknlp/training/_tf_graph_builders/ner_dl/sentence_grouper.py +28 -0
  267. sparknlp/training/_tf_graph_builders/tf2contrib/__init__.py +36 -0
  268. sparknlp/training/_tf_graph_builders/tf2contrib/core_rnn_cell.py +385 -0
  269. sparknlp/training/_tf_graph_builders/tf2contrib/fused_rnn_cell.py +183 -0
  270. sparknlp/training/_tf_graph_builders/tf2contrib/gru_ops.py +235 -0
  271. sparknlp/training/_tf_graph_builders/tf2contrib/lstm_ops.py +665 -0
  272. sparknlp/training/_tf_graph_builders/tf2contrib/rnn.py +245 -0
  273. sparknlp/training/_tf_graph_builders/tf2contrib/rnn_cell.py +4006 -0
  274. sparknlp/training/_tf_graph_builders_1x/__init__.py +0 -0
  275. sparknlp/training/_tf_graph_builders_1x/graph_builders.py +277 -0
  276. sparknlp/training/_tf_graph_builders_1x/ner_dl/__init__.py +0 -0
  277. sparknlp/training/_tf_graph_builders_1x/ner_dl/create_graph.py +34 -0
  278. sparknlp/training/_tf_graph_builders_1x/ner_dl/dataset_encoder.py +78 -0
  279. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model.py +532 -0
  280. sparknlp/training/_tf_graph_builders_1x/ner_dl/ner_model_saver.py +62 -0
  281. sparknlp/training/_tf_graph_builders_1x/ner_dl/sentence_grouper.py +28 -0
  282. sparknlp/training/conll.py +150 -0
  283. sparknlp/training/conllu.py +103 -0
  284. sparknlp/training/pos.py +103 -0
  285. sparknlp/training/pub_tator.py +76 -0
  286. sparknlp/training/spacy_to_annotation.py +57 -0
  287. sparknlp/training/tfgraphs.py +5 -0
  288. sparknlp/upload_to_hub.py +149 -0
  289. sparknlp/util.py +51 -5
  290. com/__init__.pyc +0 -0
  291. com/__pycache__/__init__.cpython-36.pyc +0 -0
  292. com/johnsnowlabs/__init__.pyc +0 -0
  293. com/johnsnowlabs/__pycache__/__init__.cpython-36.pyc +0 -0
  294. com/johnsnowlabs/nlp/__init__.pyc +0 -0
  295. com/johnsnowlabs/nlp/__pycache__/__init__.cpython-36.pyc +0 -0
  296. spark_nlp-2.6.3rc1.dist-info/METADATA +0 -36
  297. spark_nlp-2.6.3rc1.dist-info/RECORD +0 -48
  298. sparknlp/__init__.pyc +0 -0
  299. sparknlp/__pycache__/__init__.cpython-36.pyc +0 -0
  300. sparknlp/__pycache__/annotation.cpython-36.pyc +0 -0
  301. sparknlp/__pycache__/annotator.cpython-36.pyc +0 -0
  302. sparknlp/__pycache__/base.cpython-36.pyc +0 -0
  303. sparknlp/__pycache__/common.cpython-36.pyc +0 -0
  304. sparknlp/__pycache__/embeddings.cpython-36.pyc +0 -0
  305. sparknlp/__pycache__/functions.cpython-36.pyc +0 -0
  306. sparknlp/__pycache__/internal.cpython-36.pyc +0 -0
  307. sparknlp/__pycache__/pretrained.cpython-36.pyc +0 -0
  308. sparknlp/__pycache__/storage.cpython-36.pyc +0 -0
  309. sparknlp/__pycache__/training.cpython-36.pyc +0 -0
  310. sparknlp/__pycache__/util.cpython-36.pyc +0 -0
  311. sparknlp/annotation.pyc +0 -0
  312. sparknlp/annotator.py +0 -3006
  313. sparknlp/annotator.pyc +0 -0
  314. sparknlp/base.py +0 -347
  315. sparknlp/base.pyc +0 -0
  316. sparknlp/common.py +0 -193
  317. sparknlp/common.pyc +0 -0
  318. sparknlp/embeddings.py +0 -40
  319. sparknlp/embeddings.pyc +0 -0
  320. sparknlp/internal.py +0 -288
  321. sparknlp/internal.pyc +0 -0
  322. sparknlp/pretrained.py +0 -123
  323. sparknlp/pretrained.pyc +0 -0
  324. sparknlp/storage.py +0 -32
  325. sparknlp/storage.pyc +0 -0
  326. sparknlp/training.py +0 -62
  327. sparknlp/training.pyc +0 -0
  328. sparknlp/util.pyc +0 -0
  329. {spark_nlp-2.6.3rc1.dist-info → spark_nlp-6.2.1.dist-info}/top_level.txt +0 -0
sparknlp/annotator.pyc DELETED
Binary file
sparknlp/base.py DELETED
@@ -1,347 +0,0 @@
- from abc import ABC
-
- from pyspark import keyword_only
- from pyspark.ml.wrapper import JavaEstimator
- from pyspark.ml.param.shared import Param, Params, TypeConverters
- from pyspark.ml.pipeline import Pipeline, PipelineModel, Estimator, Transformer
- from sparknlp.common import AnnotatorProperties
- from sparknlp.internal import AnnotatorTransformer, RecursiveEstimator, RecursiveTransformer
-
- from sparknlp.annotation import Annotation
- import sparknlp.internal as _internal
-
-
- class LightPipeline:
-     def __init__(self, pipelineModel, parse_embeddings=False):
-         self.pipeline_model = pipelineModel
-         self._lightPipeline = _internal._LightPipeline(pipelineModel, parse_embeddings).apply()
-
-     @staticmethod
-     def _annotation_from_java(java_annotations):
-         annotations = []
-         for annotation in java_annotations:
-             annotations.append(Annotation(annotation.annotatorType(),
-                                           annotation.begin(),
-                                           annotation.end(),
-                                           annotation.result(),
-                                           annotation.metadata(),
-                                           annotation.embeddings
-                                           )
-                                )
-         return annotations
-
-     def fullAnnotate(self, target):
-         result = []
-         if type(target) is str:
-             target = [target]
-         for row in self._lightPipeline.fullAnnotateJava(target):
-             kas = {}
-             for atype, annotations in row.items():
-                 kas[atype] = self._annotation_from_java(annotations)
-             result.append(kas)
-         return result
-
-     def annotate(self, target):
-
-         def reformat(annotations):
-             return {k: list(v) for k, v in annotations.items()}
-
-         annotations = self._lightPipeline.annotateJava(target)
-
-         if type(target) is str:
-             result = reformat(annotations)
-         elif type(target) is list:
-             result = list(map(lambda a: reformat(a), list(annotations)))
-         else:
-             raise TypeError("target for annotation may be 'str' or 'list'")
-
-         return result
-
-     def transform(self, dataframe):
-         return self.pipeline_model.transform(dataframe)
-
-     def setIgnoreUnsupported(self, value):
-         self._lightPipeline.setIgnoreUnsupported(value)
-         return self
-
-     def getIgnoreUnsupported(self):
-         return self._lightPipeline.getIgnoreUnsupported()
-
-
- class RecursivePipeline(Pipeline, JavaEstimator):
-     @keyword_only
-     def __init__(self, *args, **kwargs):
-         super(RecursivePipeline, self).__init__(*args, **kwargs)
-         self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.RecursivePipeline", self.uid)
-         kwargs = self._input_kwargs
-         self.setParams(**kwargs)
-
-     def _fit(self, dataset):
-         stages = self.getStages()
-         for stage in stages:
-             if not (isinstance(stage, Estimator) or isinstance(stage, Transformer)):
-                 raise TypeError(
-                     "Cannot recognize a pipeline stage of type %s." % type(stage))
-         indexOfLastEstimator = -1
-         for i, stage in enumerate(stages):
-             if isinstance(stage, Estimator):
-                 indexOfLastEstimator = i
-         transformers = []
-         for i, stage in enumerate(stages):
-             if i <= indexOfLastEstimator:
-                 if isinstance(stage, Transformer):
-                     transformers.append(stage)
-                     dataset = stage.transform(dataset)
-                 elif isinstance(stage, RecursiveEstimator):
-                     model = stage.fit(dataset, pipeline=PipelineModel(transformers))
-                     transformers.append(model)
-                     if i < indexOfLastEstimator:
-                         dataset = model.transform(dataset)
-                 else:
-                     model = stage.fit(dataset)
-                     transformers.append(model)
-                     if i < indexOfLastEstimator:
-                         dataset = model.transform(dataset)
-             else:
-                 transformers.append(stage)
-         return PipelineModel(transformers)
-
-
- class RecursivePipelineModel(PipelineModel):
-
-     def __init__(self, pipeline_model):
-         super(PipelineModel, self).__init__()
-         self.stages = pipeline_model.stages
-
-     def _transform(self, dataset):
-         for t in self.stages:
-             if isinstance(t, HasRecursiveTransform):
-                 # drops current stage from the recursive pipeline within
-                 dataset = t.transform_recursive(dataset, PipelineModel(self.stages[:-1]))
-             elif isinstance(t, AnnotatorProperties) and t.getLazyAnnotator():
-                 pass
-             else:
-                 dataset = t.transform(dataset)
-         return dataset
-
-
- class HasRecursiveFit(RecursiveEstimator, ABC):
-     pass
-
-
- class HasRecursiveTransform(RecursiveTransformer):
-     pass
-
-
- class DocumentAssembler(AnnotatorTransformer):
-
-     inputCol = Param(Params._dummy(), "inputCol", "input column name", typeConverter=TypeConverters.toString)
-     outputCol = Param(Params._dummy(), "outputCol", "output column name", typeConverter=TypeConverters.toString)
-     idCol = Param(Params._dummy(), "idCol", "column for setting an id to such string in row", typeConverter=TypeConverters.toString)
-     metadataCol = Param(Params._dummy(), "metadataCol", "String to String map column to use as metadata", typeConverter=TypeConverters.toString)
-     calculationsCol = Param(Params._dummy(), "calculationsCol", "String to Float vector map column to use as embeddigns and other representations", typeConverter=TypeConverters.toString)
-     cleanupMode = Param(Params._dummy(), "cleanupMode", "possible values: disabled, inplace, inplace_full, shrink, shrink_full, each, each_full, delete_full", typeConverter=TypeConverters.toString)
-     name = 'DocumentAssembler'
-
-     @keyword_only
-     def __init__(self):
-         super(DocumentAssembler, self).__init__(classname="com.johnsnowlabs.nlp.DocumentAssembler")
-         self._setDefault(outputCol="document", cleanupMode='disabled')
-
-     @keyword_only
-     def setParams(self):
-         kwargs = self._input_kwargs
-         return self._set(**kwargs)
-
-     def setInputCol(self, value):
-         return self._set(inputCol=value)
-
-     def setOutputCol(self, value):
-         return self._set(outputCol=value)
-
-     def setIdCol(self, value):
-         return self._set(idCol=value)
-
-     def setMetadataCol(self, value):
-         return self._set(metadataCol=value)
-
-     def setCalculationsCol(self, value):
-         return self._set(metadataCol=value)
-
-     def setCleanupMode(self, value):
-         if value.strip().lower() not in ['disabled', 'inplace', 'inplace_full', 'shrink', 'shrink_full', 'each', 'each_full', 'delete_full']:
-             raise Exception("Cleanup mode possible values: disabled, inplace, inplace_full, shrink, shrink_full, each, each_full, delete_full")
-         return self._set(cleanupMode=value)
-
-
- class TokenAssembler(AnnotatorTransformer, AnnotatorProperties):
-
-     name = "TokenAssembler"
-     preservePosition = Param(Params._dummy(), "preservePosition", "whether to preserve the actual position of the tokens or reduce them to one space", typeConverter=TypeConverters.toBoolean)
-
-     @keyword_only
-     def __init__(self):
-         super(TokenAssembler, self).__init__(classname="com.johnsnowlabs.nlp.TokenAssembler")
-
-     @keyword_only
-     def setParams(self):
-         kwargs = self._input_kwargs
-         return self._set(**kwargs)
-
-     def setPreservePosition(self, value):
-         return self._set(preservePosition=value)
-
-
- class Doc2Chunk(AnnotatorTransformer, AnnotatorProperties):
-
-     chunkCol = Param(Params._dummy(), "chunkCol", "column that contains string. Must be part of DOCUMENT", typeConverter=TypeConverters.toString)
-     startCol = Param(Params._dummy(), "startCol", "column that has a reference of where chunk begins", typeConverter=TypeConverters.toString)
-     startColByTokenIndex = Param(Params._dummy(), "startColByTokenIndex", "whether start col is by whitespace tokens", typeConverter=TypeConverters.toBoolean)
-     isArray = Param(Params._dummy(), "isArray", "whether the chunkCol is an array of strings", typeConverter=TypeConverters.toBoolean)
-     failOnMissing = Param(Params._dummy(), "failOnMissing", "whether to fail the job if a chunk is not found within document. return empty otherwise", typeConverter=TypeConverters.toBoolean)
-     lowerCase = Param(Params._dummy(), "lowerCase", "whether to lower case for matching case", typeConverter=TypeConverters.toBoolean)
-     name = "Doc2Chunk"
-
-     @keyword_only
-     def __init__(self):
-         super(Doc2Chunk, self).__init__(classname="com.johnsnowlabs.nlp.Doc2Chunk")
-         self._setDefault(
-             isArray=False
-         )
-
-     @keyword_only
-     def setParams(self):
-         kwargs = self._input_kwargs
-         return self._set(**kwargs)
-
-     def setChunkCol(self, value):
-         return self._set(chunkCol=value)
-
-     def setIsArray(self, value):
-         return self._set(isArray=value)
-
-     def setStartCol(self, value):
-         return self._set(startCol=value)
-
-     def setStartColByTokenIndex(self, value):
-         return self._set(startColByTokenIndex=value)
-
-     def setFailOnMissing(self, value):
-         return self._set(failOnMissing=value)
-
-     def setLowerCase(self, value):
-         return self._set(lowerCase=value)
-
-
- class Chunk2Doc(AnnotatorTransformer, AnnotatorProperties):
-
-     name = "Chunk2Doc"
-
-     @keyword_only
-     def __init__(self):
-         super(Chunk2Doc, self).__init__(classname="com.johnsnowlabs.nlp.Chunk2Doc")
-
-     @keyword_only
-     def setParams(self):
-         kwargs = self._input_kwargs
-         return self._set(**kwargs)
-
-
- class Finisher(AnnotatorTransformer):
-
-     inputCols = Param(Params._dummy(), "inputCols", "input annotations", typeConverter=TypeConverters.toListString)
-     outputCols = Param(Params._dummy(), "outputCols", "output finished annotation cols", typeConverter=TypeConverters.toListString)
-     valueSplitSymbol = Param(Params._dummy(), "valueSplitSymbol", "character separating annotations", typeConverter=TypeConverters.toString)
-     annotationSplitSymbol = Param(Params._dummy(), "annotationSplitSymbol", "character separating annotations", typeConverter=TypeConverters.toString)
-     cleanAnnotations = Param(Params._dummy(), "cleanAnnotations", "whether to remove annotation columns", typeConverter=TypeConverters.toBoolean)
-     includeMetadata = Param(Params._dummy(), "includeMetadata", "annotation metadata format", typeConverter=TypeConverters.toBoolean)
-     outputAsArray = Param(Params._dummy(), "outputAsArray", "finisher generates an Array with the results instead of string", typeConverter=TypeConverters.toBoolean)
-     parseEmbeddingsVectors = Param(Params._dummy(), "parseEmbeddingsVectors", "whether to include embeddings vectors in the process", typeConverter=TypeConverters.toBoolean)
-
-     name = "Finisher"
-
-     @keyword_only
-     def __init__(self):
-         super(Finisher, self).__init__(classname="com.johnsnowlabs.nlp.Finisher")
-         self._setDefault(
-             cleanAnnotations=True,
-             includeMetadata=False,
-             outputAsArray=True,
-             parseEmbeddingsVectors=False
-         )
-
-     @keyword_only
-     def setParams(self):
-         kwargs = self._input_kwargs
-         return self._set(**kwargs)
-
-     def setInputCols(self, *value):
-         if len(value) == 1 and type(value[0]) == list:
-             return self._set(inputCols=value[0])
-         else:
-             return self._set(inputCols=list(value))
-
-     def setOutputCols(self, *value):
-         if len(value) == 1 and type(value[0]) == list:
-             return self._set(outputCols=value[0])
-         else:
-             return self._set(outputCols=list(value))
-
-     def setValueSplitSymbol(self, value):
-         return self._set(valueSplitSymbol=value)
-
-     def setAnnotationSplitSymbol(self, value):
-         return self._set(annotationSplitSymbol=value)
-
-     def setCleanAnnotations(self, value):
-         return self._set(cleanAnnotations=value)
-
-     def setIncludeMetadata(self, value):
-         return self._set(includeMetadata=value)
-
-     def setOutputAsArray(self, value):
-         return self._set(outputAsArray=value)
-
-     def setParseEmbeddingsVectors(self, value):
-         return self._set(parseEmbeddingsVectors=value)
-
-
- class EmbeddingsFinisher(AnnotatorTransformer):
-
-     inputCols = Param(Params._dummy(), "inputCols", "name of input annotation cols containing embeddings", typeConverter=TypeConverters.toListString)
-     outputCols = Param(Params._dummy(), "outputCols", "output EmbeddingsFinisher ouput cols", typeConverter=TypeConverters.toListString)
-     cleanAnnotations = Param(Params._dummy(), "cleanAnnotations", "whether to remove all the existing annotation columns", typeConverter=TypeConverters.toBoolean)
-     outputAsVector = Param(Params._dummy(), "outputAsVector", "if enabled it will output the embeddings as Vectors instead of arrays", typeConverter=TypeConverters.toBoolean)
-
-     name = "EmbeddingsFinisher"
-
-     @keyword_only
-     def __init__(self):
-         super(EmbeddingsFinisher, self).__init__(classname="com.johnsnowlabs.nlp.EmbeddingsFinisher")
-         self._setDefault(
-             cleanAnnotations=False,
-             outputAsVector=False
-         )
-
-     @keyword_only
-     def setParams(self):
-         kwargs = self._input_kwargs
-         return self._set(**kwargs)
-
-     def setInputCols(self, *value):
-         if len(value) == 1 and type(value[0]) == list:
-             return self._set(inputCols=value[0])
-         else:
-             return self._set(inputCols=list(value))
-
-     def setOutputCols(self, *value):
-         if len(value) == 1 and type(value[0]) == list:
-             return self._set(outputCols=value[0])
-         else:
-             return self._set(outputCols=list(value))
-
-     def setCleanAnnotations(self, value):
-         return self._set(cleanAnnotations=value)
-
-     def setOutputAsVector(self, value):
-         return self._set(outputAsVector=value)
sparknlp/base.pyc DELETED
Binary file
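Note: the monolithic base.py deleted above was split into the sparknlp/base/ package listed as entries 203–219 (light_pipeline.py, document_assembler.py, finisher.py, and so on), and the public API largely carried over. As a minimal usage sketch of the LightPipeline shown above — assuming Spark NLP is installed and the pretrained pipeline "explain_document_dl" is available for download:

    import sparknlp
    from sparknlp.base import LightPipeline
    from sparknlp.pretrained import PretrainedPipeline

    spark = sparknlp.start()

    # Wrap a fitted PipelineModel so plain strings can be annotated
    # locally, without building a Spark DataFrame per request.
    pipeline = PretrainedPipeline("explain_document_dl", lang="en")
    light = LightPipeline(pipeline.model)

    # annotate() accepts a str or a list of str, per the type check above.
    result = light.annotate("Spark NLP ships LightPipeline for fast local inference.")
    print(result["token"])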
sparknlp/common.py DELETED
@@ -1,193 +0,0 @@
- from pyspark.ml.util import JavaMLWritable
- from pyspark.ml.wrapper import JavaModel, JavaEstimator
- from pyspark.ml.param.shared import Param, TypeConverters
- from pyspark.ml.param import Params
- from pyspark import keyword_only
- import sparknlp.internal as _internal
-
-
- class AnnotatorProperties(Params):
-
-     inputCols = Param(Params._dummy(),
-                       "inputCols",
-                       "previous annotations columns, if renamed",
-                       typeConverter=TypeConverters.toListString)
-     outputCol = Param(Params._dummy(),
-                       "outputCol",
-                       "output annotation column. can be left default.",
-                       typeConverter=TypeConverters.toString)
-     lazyAnnotator = Param(Params._dummy(),
-                           "lazyAnnotator",
-                           "Whether this AnnotatorModel acts as lazy in RecursivePipelines",
-                           typeConverter=TypeConverters.toBoolean
-                           )
-
-     def setInputCols(self, *value):
-         if len(value) == 1 and type(value[0]) == list:
-             return self._set(inputCols=value[0])
-         else:
-             return self._set(inputCols=list(value))
-
-     def getInputCols(self):
-         self.getOrDefault(self.inputCols)
-
-     def setOutputCol(self, value):
-         return self._set(outputCol=value)
-
-     def getOutputCol(self):
-         self.getOrDefault(self.outputCol)
-
-     def setLazyAnnotator(self, value):
-         return self._set(lazyAnnotator=value)
-
-     def getLazyAnnotator(self):
-         self.getOrDefault(self.lazyAnnotator)
-
-
- class AnnotatorModel(JavaModel, _internal.AnnotatorJavaMLReadable, JavaMLWritable, AnnotatorProperties, _internal.ParamsGettersSetters):
-
-     @keyword_only
-     def setParams(self):
-         kwargs = self._input_kwargs
-         return self._set(**kwargs)
-
-     @keyword_only
-     def __init__(self, classname, java_model=None):
-         super(AnnotatorModel, self).__init__(java_model=java_model)
-         if classname and not java_model:
-             self.__class__._java_class_name = classname
-             self._java_obj = self._new_java_obj(classname, self.uid)
-         if java_model is not None:
-             self._transfer_params_from_java()
-         self._setDefault(lazyAnnotator=False)
-
-
- class HasEmbeddingsProperties(Params):
-     dimension = Param(Params._dummy(),
-                       "dimension",
-                       "Number of embedding dimensions",
-                       typeConverter=TypeConverters.toInt)
-
-     def setDimension(self, value):
-         return self._set(dimension=value)
-
-     def getDimension(self):
-         return self.getOrDefault(self.dimension)
-
-
- class HasStorageRef:
-
-     storageRef = Param(Params._dummy(), "storageRef",
-                        "unique reference name for identification",
-                        TypeConverters.toString)
-
-     def setStorageRef(self, value):
-         return self._set(storageRef=value)
-
-     def getStorageRef(self):
-         return self.getOrDefault("storageRef")
-
-
- class HasCaseSensitiveProperties:
-     caseSensitive = Param(Params._dummy(),
-                           "caseSensitive",
-                           "whether to ignore case in tokens for embeddings matching",
-                           typeConverter=TypeConverters.toBoolean)
-
-     def setCaseSensitive(self, value):
-         return self._set(caseSensitive=value)
-
-     def getCaseSensitive(self):
-         return self.getOrDefault(self.caseSensitive)
-
-
- class HasExcludableStorage:
-
-     includeStorage = Param(Params._dummy(),
-                            "includeStorage",
-                            "whether to include indexed storage in trained model",
-                            typeConverter=TypeConverters.toBoolean)
-
-     def setIncludeStorage(self, value):
-         return self._set(includeStorage=value)
-
-     def getIncludeStorage(self):
-         return self.getOrDefault("includeStorage")
-
-
- class HasStorage(HasStorageRef, HasCaseSensitiveProperties, HasExcludableStorage):
-
-     storagePath = Param(Params._dummy(),
-                         "storagePath",
-                         "path to file",
-                         typeConverter=TypeConverters.identity)
-
-     def setStoragePath(self, path, read_as):
-         return self._set(storagePath=ExternalResource(path, read_as, {}))
-
-     def getStoragePath(self):
-         return self.getOrDefault("storagePath")
-
-
- class HasStorageModel(HasStorageRef, HasCaseSensitiveProperties, HasExcludableStorage):
-
-     def saveStorage(self, path, spark):
-         self._transfer_params_to_java()
-         self._java_obj.saveStorage(path, spark._jsparkSession, False)
-
-     @staticmethod
-     def loadStorage(path, spark, storage_ref):
-         raise NotImplementedError("AnnotatorModel with HasStorageModel did not implement 'loadStorage'")
-
-     @staticmethod
-     def loadStorages(path, spark, storage_ref, databases):
-         for database in databases:
-             _internal._StorageHelper(path, spark, database, storage_ref, within_storage=False)
-
-
- class AnnotatorApproach(JavaEstimator, JavaMLWritable, _internal.AnnotatorJavaMLReadable, AnnotatorProperties,
-                         _internal.ParamsGettersSetters):
-
-     @keyword_only
-     def __init__(self, classname):
-         _internal.ParamsGettersSetters.__init__(self)
-         self.__class__._java_class_name = classname
-         self._java_obj = self._new_java_obj(classname, self.uid)
-         self._setDefault(lazyAnnotator=False)
-
-     def _create_model(self, java_model):
-         raise NotImplementedError('Please implement _create_model in %s' % self)
-
-
- class RecursiveAnnotatorApproach(_internal.RecursiveEstimator, JavaMLWritable, _internal.AnnotatorJavaMLReadable, AnnotatorProperties,
-                                  _internal.ParamsGettersSetters):
-     @keyword_only
-     def __init__(self, classname):
-         _internal.ParamsGettersSetters.__init__(self)
-         self.__class__._java_class_name = classname
-         self._java_obj = self._new_java_obj(classname, self.uid)
-         self._setDefault(lazyAnnotator=False)
-
-     def _create_model(self, java_model):
-         raise NotImplementedError('Please implement _create_model in %s' % self)
-
-
- def RegexRule(rule, identifier):
-     return _internal._RegexRule(rule, identifier).apply()
-
-
- class ReadAs(object):
-     TEXT = "TEXT"
-     SPARK = "SPARK"
-     BINARY = "BINARY"
-
-
- def ExternalResource(path, read_as=ReadAs.TEXT, options={}):
-     return _internal._ExternalResource(path, read_as, options).apply()
-
-
- class CoverageResult:
-     def __init__(self, cov_obj):
-         self.covered = cov_obj.covered()
-         self.total = cov_obj.total()
-         self.percentage = cov_obj.percentage()
sparknlp/common.pyc DELETED
Binary file
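Note: the mixins deleted above (AnnotatorProperties, HasStorageRef, HasCaseSensitiveProperties, and the rest) now live as separate modules under sparknlp/common/ (entries 220–232). They are what give every annotator its shared column-wiring interface; a small sketch of how that surface is used, with illustrative column names:

    from pyspark.ml import Pipeline
    from sparknlp.base import DocumentAssembler
    from sparknlp.annotator import Tokenizer

    # Each stage declares its input/output columns through the
    # setters contributed by the AnnotatorProperties mixin.
    document_assembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    tokenizer = Tokenizer() \
        .setInputCols(["document"]) \
        .setOutputCol("token")

    pipeline = Pipeline(stages=[document_assembler, tokenizer])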
sparknlp/embeddings.py DELETED
@@ -1,40 +0,0 @@
- import sparknlp.internal as _internal
-
- from pyspark.ml.param import Params
- from pyspark import keyword_only
- import sys
- import threading
- import time
- import sparknlp.pretrained as _pretrained
-
-
- # DONT REMOVE THIS IMPORT
- from sparknlp.annotator import WordEmbeddingsModel
- ####
-
-
- class Embeddings:
-     def __init__(self, embeddings):
-         self.jembeddings = embeddings
-
-
- class EmbeddingsHelper:
-     @classmethod
-     def load(cls, path, spark_session, embeddings_format, embeddings_ref, embeddings_dim, embeddings_casesens=False):
-         print("Loading started this may take some time")
-         stop_threads = False
-         t1 = threading.Thread(target=_pretrained.printProgress, args=(lambda: stop_threads,))
-         t1.start()
-         jembeddings = _internal._EmbeddingsHelperLoad(path, spark_session, embeddings_format, embeddings_ref, embeddings_dim, embeddings_casesens).apply()
-         stop_threads = True
-         t1.join()
-         print("Loading done")
-         return Embeddings(jembeddings)
-
-     @classmethod
-     def save(cls, path, embeddings, spark_session):
-         return _internal._EmbeddingsHelperSave(path, embeddings, spark_session).apply()
-
-     @classmethod
-     def getFromAnnotator(cls, annotator):
-         return _internal._EmbeddingsHelperFromAnnotator(annotator).apply()
sparknlp/embeddings.pyc DELETED
Binary file
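Note: the EmbeddingsHelper class deleted above has no one-to-one successor; per the file list, word embeddings now live in sparknlp/annotator/embeddings/word_embeddings.py (entry 123), and in later releases pretrained embeddings are loaded through the annotators themselves. A sketch of the equivalent modern pattern, assuming the "glove_100d" model is available for download:

    from sparknlp.annotator import WordEmbeddingsModel

    # Pretrained embeddings are fetched via the annotator's own
    # .pretrained() loader rather than EmbeddingsHelper.load().
    embeddings = WordEmbeddingsModel.pretrained("glove_100d", lang="en") \
        .setInputCols(["document", "token"]) \
        .setOutputCol("embeddings")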