PyPI - SinaTools - Versions diffs - 0.1.38__tar.gz → 0.1.39__tar.gz - Mend

SinaTools 0.1.38__tar.gz → 0.1.39__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (144) hide show

{sinatools-0.1.38 → sinatools-0.1.39}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: SinaTools
-Version: 0.1.38
+Version: 0.1.39
 Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
 Home-page: https://github.com/SinaLab/sinatools
 License: MIT license
@@ -13,12 +13,17 @@ Requires-Dist: farasapy
 Requires-Dist: tqdm
 Requires-Dist: requests
 Requires-Dist: pathlib
-Requires-Dist: torch==1.13.0
-Requires-Dist: transformers==4.24.0
-Requires-Dist: torchtext==0.14.0
-Requires-Dist: torchvision==0.14.0
+Requires-Dist: transformers==4.47.1
+Requires-Dist: torchvision==0.20.1
 Requires-Dist: seqeval==1.2.2
 Requires-Dist: natsort==7.1.1
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: requires-dist
+Dynamic: summary
 SinaTools
 ======================

{sinatools-0.1.38 → sinatools-0.1.39}/SinaTools.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: SinaTools
-Version: 0.1.38
+Version: 0.1.39
 Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
 Home-page: https://github.com/SinaLab/sinatools
 License: MIT license
@@ -13,12 +13,17 @@ Requires-Dist: farasapy
 Requires-Dist: tqdm
 Requires-Dist: requests
 Requires-Dist: pathlib
-Requires-Dist: torch==1.13.0
-Requires-Dist: transformers==4.24.0
-Requires-Dist: torchtext==0.14.0
-Requires-Dist: torchvision==0.14.0
+Requires-Dist: transformers==4.47.1
+Requires-Dist: torchvision==0.20.1
 Requires-Dist: seqeval==1.2.2
 Requires-Dist: natsort==7.1.1
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: requires-dist
+Dynamic: summary
 SinaTools
 ======================

sinatools-0.1.39/SinaTools.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,9 @@
+six
+farasapy
+tqdm
+requests
+pathlib
+transformers==4.47.1
+torchvision==0.20.1
+seqeval==1.2.2
+natsort==7.1.1

{sinatools-0.1.38 → sinatools-0.1.39}/setup.py RENAMED Viewed

@@ -18,10 +18,10 @@ requirements = [
     'requests',
     # 'regex',
     'pathlib',
-    'torch==1.13.0',
-    'transformers==4.24.0',
-    'torchtext==0.14.0',
-    'torchvision==0.14.0',
+    # 'torch==2.5.1',
+    'transformers==4.47.1',
+    # 'torchtext==0.14.0',
+    'torchvision==0.20.1',
     'seqeval==1.2.2',
     'natsort==7.1.1'
 ]

sinatools-0.1.39/sinatools/VERSION ADDED Viewed

@@ -0,0 +1 @@
+0.1.39

sinatools-0.1.39/sinatools/environment.yml ADDED Viewed

@@ -0,0 +1,182 @@
+name: dev
+channels:
+  - pytorch
+  - nvidia
+  - defaults
+  - https://repo.anaconda.com/pkgs/main
+  - https://repo.anaconda.com/pkgs/r
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - _sysroot_linux-64_curr_repodata_hack=3=haa98f57_10
+  - binutils_impl_linux-64=2.40=h5293946_0
+  - binutils_linux-64=2.40.0=hc2dff05_1
+  - blas=1.0=mkl
+  - brotli-python=1.0.9=py311h6a678d5_8
+  - bzip2=1.0.8=h5eee18b_6
+  - ca-certificates=2024.11.26=h06a4308_0
+  - certifi=2024.12.14=py311h06a4308_0
+  - charset-normalizer=3.3.2=pyhd3eb1b0_0
+  - cuda-cudart=12.4.127=0
+  - cuda-cupti=12.4.127=0
+  - cuda-libraries=12.4.1=0
+  - cuda-nvrtc=12.4.127=0
+  - cuda-nvtx=12.4.127=0
+  - cuda-opencl=12.4.127=0
+  - cuda-runtime=12.4.1=0
+  - cuda-version=11.7=h6a555f7_3
+  - cudatoolkit=11.7.0=hd8887f6_10
+  - ffmpeg=4.3=hf484d3e_0
+  - filelock=3.13.1=py311h06a4308_0
+  - freetype=2.12.1=h4a9f257_0
+  - fsspec=2024.6.1=py311h06a4308_0
+  - gcc_impl_linux-64=11.2.0=h1234567_1
+  - gcc_linux-64=11.2.0=h5c386dc_1
+  - giflib=5.2.2=h5eee18b_0
+  - gmp=6.2.1=h295c915_3
+  - gmpy2=2.1.2=py311hc9b5ff0_0
+  - gnutls=3.6.15=he1e5248_0
+  - gxx_impl_linux-64=11.2.0=h1234567_1
+  - gxx_linux-64=11.2.0=hc2dff05_1
+  - idna=3.7=py311h06a4308_0
+  - intel-openmp=2023.1.0=hdb19cb5_46306
+  - jinja2=3.1.4=py311h06a4308_1
+  - jpeg=9e=h5eee18b_3
+  - kernel-headers_linux-64=3.10.0=h57e8cba_10
+  - lame=3.100=h7b6447c_0
+  - lcms2=2.16=hb9589c4_0
+  - ld_impl_linux-64=2.40=h12ee557_0
+  - lerc=4.0.0=h6a678d5_0
+  - libabseil=20240116.2=cxx17_h6a678d5_0
+  - libcublas=12.4.5.8=0
+  - libcufft=11.2.1.3=0
+  - libcufile=1.9.1.3=0
+  - libcurand=10.3.5.147=0
+  - libcusolver=11.6.1.9=0
+  - libcusparse=12.3.1.170=0
+  - libdeflate=1.22=h5eee18b_0
+  - libffi=3.4.4=h6a678d5_1
+  - libgcc-devel_linux-64=11.2.0=h1234567_1
+  - libgcc-ng=11.2.0=h1234567_1
+  - libgomp=11.2.0=h1234567_1
+  - libiconv=1.16=h5eee18b_3
+  - libidn2=2.3.4=h5eee18b_0
+  - libjpeg-turbo=2.0.0=h9bf148f_0
+  - libnpp=12.2.5.30=0
+  - libnvfatbin=12.4.127=0
+  - libnvjitlink=12.4.127=0
+  - libnvjpeg=12.3.1.117=0
+  - libpng=1.6.39=h5eee18b_0
+  - libprotobuf=4.25.3=he621ea3_0
+  - libstdcxx-devel_linux-64=11.2.0=h1234567_1
+  - libstdcxx-ng=11.2.0=h1234567_1
+  - libtasn1=4.19.0=h5eee18b_0
+  - libtiff=4.5.1=hffd6297_1
+  - libunistring=0.9.10=h27cfd23_0
+  - libuuid=1.41.5=h5eee18b_0
+  - libwebp=1.3.2=h11a3e52_0
+  - libwebp-base=1.3.2=h5eee18b_1
+  - llvm-openmp=14.0.6=h9e868ea_0
+  - lz4-c=1.9.4=h6a678d5_1
+  - markupsafe=2.1.3=py311h5eee18b_0
+  - mkl=2023.1.0=h213fc3f_46344
+  - mkl-service=2.4.0=py311h5eee18b_1
+  - mkl_fft=1.3.11=py311h5eee18b_0
+  - mkl_random=1.2.8=py311ha02d727_0
+  - mpc=1.1.0=h10f8cd9_1
+  - mpfr=4.0.2=hb69a4c5_1
+  - mpmath=1.3.0=py311h06a4308_0
+  - ncurses=6.4=h6a678d5_0
+  - nettle=3.7.3=hbbd107a_1
+  - networkx=3.2.1=py311h06a4308_0
+  - numpy=2.0.1=py311h08b1b3b_1
+  - numpy-base=2.0.1=py311hf175353_1
+  - openh264=2.1.1=h4ff587b_0
+  - openjpeg=2.5.2=he7f1fd0_0
+  - openssl=3.0.15=h5eee18b_0
+  - pillow=11.0.0=py311hcea889d_1
+  - pip=24.2=py311h06a4308_0
+  - pysocks=1.7.1=py311h06a4308_0
+  - python=3.11.11=he870216_0
+  - pytorch=2.5.1=py3.11_cuda12.4_cudnn9.1.0_0
+  - pytorch-cuda=12.4=hc786d27_7
+  - pytorch-mutex=1.0=cuda
+  - pyyaml=6.0.2=py311h5eee18b_0
+  - readline=8.2=h5eee18b_0
+  - requests=2.32.3=py311h06a4308_1
+  - setuptools=75.1.0=py311h06a4308_0
+  - sqlite=3.45.3=h5eee18b_0
+  - sysroot_linux-64=2.17=h57e8cba_10
+  - tbb=2021.8.0=hdb19cb5_0
+  - tk=8.6.14=h39e8969_0
+  - torchaudio=2.5.1=py311_cu124
+  - torchtriton=3.1.0=py311
+  - torchvision=0.20.1=py311_cu124
+  - typing_extensions=4.12.2=py311h06a4308_0
+  - urllib3=2.2.3=py311h06a4308_0
+  - wheel=0.44.0=py311h06a4308_0
+  - xz=5.4.6=h5eee18b_1
+  - yaml=0.2.5=h7b6447c_0
+  - zlib=1.2.13=h5eee18b_1
+  - zstd=1.5.6=hc292b87_0
+  - pip:
+      - absl-py==2.1.0
+      - accelerate==1.2.1
+      - aiohappyeyeballs==2.4.4
+      - aiohttp==3.11.11
+      - aiosignal==1.3.2
+      - annotated-types==0.7.0
+      - attrs==24.3.0
+      - datasets==3.2.0
+      - deepspeed==0.16.2
+      - dill==0.3.8
+      - einops==0.8.0
+      - flash-attn==2.7.2.post1
+      - frozenlist==1.5.0
+      - grpcio==1.70.0
+      - hjson==3.1.0
+      - huggingface-hub==0.27.0
+      - joblib==1.4.2
+      - markdown==3.7
+      - markdown-it-py==3.0.0
+      - mdurl==0.1.2
+      - mpi4py==4.0.1
+      - msgpack==1.1.0
+      - multidict==6.1.0
+      - multiprocess==0.70.16
+      - natsort==8.4.0
+      - ninja==1.11.1.3
+      - nvidia-ml-py==12.560.30
+      - packaging==24.2
+      - pandas==2.2.3
+      - peft==0.14.0
+      - propcache==0.2.1
+      - protobuf==6.30.0
+      - psutil==6.1.1
+      - py-cpuinfo==9.0.0
+      - pyarrow==18.1.0
+      - pydantic==2.10.4
+      - pydantic-core==2.27.2
+      - pygments==2.18.0
+      - python-dateutil==2.9.0.post0
+      - pytz==2024.2
+      - regex==2024.11.6
+      - rich==13.9.4
+      - safetensors==0.4.5
+      - scikit-learn==1.6.1
+      - scipy==1.15.2
+      - seqeval==1.2.2
+      - six==1.17.0
+      - sympy==1.13.1
+      - tensorboard==2.19.0
+      - tensorboard-data-server==0.7.2
+      - threadpoolctl==3.5.0
+      - tokenizers==0.21.0
+      - tqdm==4.67.1
+      - transformers==4.47.1
+      - trl==0.12.0
+      - tzdata==2024.2
+      - werkzeug==3.1.3
+      - xxhash==3.5.0
+      - yarl==1.18.3

{sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/data/datasets.py RENAMED Viewed

@@ -37,7 +37,11 @@ class Token:
         :return: str
         """
         gold_tags = "|".join(self.gold_tag)
-        pred_tags = "|".join([pred_tag["tag"] for pred_tag in self.pred_tag])
+        if self.pred_tag:
+            pred_tags = "|".join([pred_tag["tag"] for pred_tag in self.pred_tag])
+        else:
+            pred_tags = ""
         if self.gold_tag:
             r = f"{self.text}\t{gold_tags}\t{pred_tags}"
@@ -139,8 +143,8 @@ class NestedTagsDataset(Dataset):
         masks = torch.cat(masks)
         # Pad the tags, do the padding for each tag type
-        tags = [torch.nn.ConstantPad1d((0, subwords.shape[-1] - tag.shape[-1]), vocab.get_stoi()["<pad>"])(tag)
+        tags = [torch.nn.ConstantPad1d((0, subwords.shape[-1] - tag.shape[-1]), vocab.get_stoi()["O"])(tag)
                 for tag, vocab in zip(tags, self.vocab.tags[1:])]
         tags = torch.cat(tags)
-        return subwords, tags, tokens, masks, valid_len
+        return subwords, tags, tokens, masks, valid_len

{sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/data_format.py RENAMED Viewed

@@ -1,16 +1,30 @@
 from torch.utils.data import DataLoader
-from torchtext.vocab import vocab
 from collections import Counter, namedtuple
 import logging
 import re
 import itertools
 from sinatools.ner.helpers import load_object
-from sinatools.ner.datasets import Token
-from sinatools.utils.tokenizers_words import simple_word_tokenize
+from sinatools.ner.data.datasets import Token
 logger = logging.getLogger(__name__)
+class Vocab:
+    def __init__(self, counter, specials=[]) -> None:
+        self.itos = list(counter.keys()) + specials
+        self.stoi = {s: i for i, s in enumerate(self.itos)}
+        self.word_count = counter
+    def get_itos(self) -> list[str]:
+        return self.itos
+    def get_stoi(self) -> dict[str, int]:
+        return self.stoi
+    def __len__(self):
+        return len(self.itos)
 def conll_to_segments(filename):
     """
     Convert CoNLL files to segments. This return list of segments and each segment is
@@ -60,8 +74,8 @@ def parse_conll_files(data_paths):
     # Generate vocabs for tags and tokens
     tag_vocabs = tag_vocab_by_type(tags)
-    tag_vocabs.insert(0, vocab(Counter(tags)))
-    vocabs = vocabs(tokens=vocab(Counter(tokens), specials=["UNK"]), tags=tag_vocabs)
+    tag_vocabs.insert(0, Vocab(Counter(tags)))
+    vocabs = vocabs(tokens=Vocab(Counter(tokens), specials=["UNK"]), tags=tag_vocabs)
     return tuple(datasets), vocabs
@@ -72,9 +86,9 @@ def tag_vocab_by_type(tags):
     tag_types = sorted(list(set([tag.split("-", 1)[1] for tag in tag_names if "-" in tag])))
     for tag_type in tag_types:
-        r = re.compile(".*-" + tag_type)
+        r = re.compile(".*-" + tag_type + "$")
         t = list(filter(r.match, tags)) + ["O"]
-        vocabs.append(vocab(Counter(t), specials=["<pad>"]))
+        vocabs.append(Vocab(Counter(t)))
     return vocabs
@@ -83,13 +97,11 @@ def text2segments(text):
     """
     Convert text to a datasets and index the tokens
     """
-    #dataset = [[Token(text=token, gold_tag=["O"]) for token in text.split()]]
-    list_of_tokens = simple_word_tokenize(text)
-    dataset = [[Token(text=token, gold_tag=["O"]) for token in list_of_tokens]]
+    dataset = [[Token(text=token, gold_tag=["O"]) for token in text.split()]]
     tokens = [token.text for segment in dataset for token in segment]
     # Generate vocabs for the tokens
-    segment_vocab = vocab(Counter(tokens), specials=["UNK"])
+    segment_vocab = Vocab(Counter(tokens), specials=["UNK"])
     return dataset, segment_vocab
@@ -121,4 +133,4 @@ def get_dataloaders(
         logger.info("%s batches found", len(dataloader))
         dataloaders.append(dataloader)
-    return dataloaders
+    return dataloaders

sinatools-0.1.39/sinatools/ner/helpers.py ADDED Viewed

@@ -0,0 +1,117 @@
+import os
+import sys
+import logging
+import importlib
+import shutil
+import torch
+import pickle
+import json
+import random
+import numpy as np
+from argparse import Namespace
+def logging_config(log_file=None):
+    """
+    Initialize custom logger
+    :param log_file: str - path to log file, full path
+    :return: None
+    """
+    handlers = [logging.StreamHandler(sys.stdout)]
+    if log_file:
+        handlers.append(logging.FileHandler(log_file, "w", "utf-8"))
+        print("Logging to {}".format(log_file))
+    logging.basicConfig(
+        level=logging.INFO,
+        handlers=handlers,
+        format="%(levelname)s\t%(name)s\t%(asctime)s\t%(message)s",
+        datefmt="%a, %d %b %Y %H:%M:%S",
+        force=True
+    )
+def load_object(name, kwargs):
+    """
+    Load objects dynamically given the object name and its arguments
+    :param name: str - object name, class name or function name
+    :param kwargs: dict - keyword arguments
+    :return: object
+    """
+    object_module, object_name = name.rsplit(".", 1)
+    object_module = importlib.import_module(object_module)
+    fn = getattr(object_module, object_name)(**kwargs)
+    return fn
+def make_output_dirs(path, subdirs=[], overwrite=True):
+    """
+    Create root directory and any other sub-directories
+    :param path: str - root directory
+    :param subdirs: List[str] - list of sub-directories
+    :param overwrite: boolean - to overwrite the directory or not
+    :return: None
+    """
+    if overwrite:
+        shutil.rmtree(path, ignore_errors=True)
+    os.makedirs(path)
+    for subdir in subdirs:
+        os.makedirs(os.path.join(path, subdir))
+def load_checkpoint(model_path):
+    """
+    Load model given the model path
+    :param model_path: str - path to model
+    :return: tagger - arabiner.trainers.BaseTrainer - the tagger model
+             vocab - arabicner.utils.data.Vocab - indexed tags
+             train_config - argparse.Namespace - training configurations
+    """
+    with open(os.path.join(model_path, "tag_vocab.pkl"), "rb") as fh:
+        tag_vocab = pickle.load(fh)
+    # Load train configurations from checkpoint
+    train_config = Namespace()
+    with open(os.path.join(model_path, "args.json"), "r") as fh:
+        train_config.__dict__ = json.load(fh)
+    # Initialize the loss function, not used for inference, but evaluation
+    loss = load_object(train_config.loss["fn"], train_config.loss["kwargs"])
+    # Load BERT tagger
+    model = load_object(train_config.network_config["fn"], train_config.network_config["kwargs"])
+    model = torch.nn.DataParallel(model)
+    if torch.cuda.is_available():
+        model = model.cuda()
+    # Update arguments for the tagger
+    # Attach the model, loss (used for evaluations cases)
+    train_config.trainer_config["kwargs"]["model"] = model
+    train_config.trainer_config["kwargs"]["loss"] = loss
+    tagger = load_object(train_config.trainer_config["fn"], train_config.trainer_config["kwargs"])
+    tagger.load(os.path.join(model_path, "checkpoints"))
+    return tagger, tag_vocab, train_config
+def set_seed(seed):
+    """
+    Set the seed for random initialization and set
+    cuDNN parameters to ensure deterministic results across
+    multiple runs with the same seed
+    :param seed: int
+    """
+    np.random.seed(seed)
+    random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+    torch.backends.cudnn.enabled = False

{sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/trainers/BaseTrainer.py RENAMED Viewed

@@ -113,5 +113,5 @@ class BaseTrainer:
         logger.info("Loading checkpoint %s", checkpoint_path)
         device = None if torch.cuda.is_available() else torch.device('cpu')
-        checkpoint = torch.load(checkpoint_path, map_location=device)
-        self.model.load_state_dict(checkpoint["model"], strict=False)
+        checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
+        self.model.load_state_dict(checkpoint["model"], strict=False)

SinaTools 0.1.38__tar.gz → 0.1.39__tar.gz

SinaTools 0.1.38__tar.gz → 0.1.39__tar.gz