datamule-2.1.6.tar.gz → datamule-2.2.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamule-2.1.6 → datamule-2.2.1}/PKG-INFO +2 -1
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/document.py +132 -1
- datamule-2.2.1/datamule/tags/config.py +33 -0
- datamule-2.2.1/datamule/tags/regex.py +105 -0
- datamule-2.2.1/datamule/tags/utils.py +145 -0
- datamule-2.2.1/datamule/utils/__init__.py +0 -0
- datamule-2.2.1/datamule/utils/dictionaries.py +76 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule.egg-info/PKG-INFO +2 -1
- {datamule-2.1.6 → datamule-2.2.1}/datamule.egg-info/SOURCES.txt +5 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule.egg-info/requires.txt +1 -0
- {datamule-2.1.6 → datamule-2.2.1}/setup.py +3 -2
- {datamule-2.1.6 → datamule-2.2.1}/datamule/__init__.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/config.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/data/listed_filer_metadata.csv +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/datamule/__init__.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/datamule/datamule_lookup.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/datamule/datamule_mysql_rds.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/datamule/downloader.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/datamule/sec_connector.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/datasets.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/__init__.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/tables/__init__.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/tables/tables.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/tables/tables_13fhr.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/tables/tables_25nse.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/tables/tables_informationtable.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/tables/tables_npx.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/tables/tables_ownership.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/tables/tables_proxyvotingrecord.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/tables/tables_sbsef.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/tables/tables_sdr.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/document/tables/utils.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/helper.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/index.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/mapping_dicts/__init__.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/mapping_dicts/html_mapping_dicts.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/mapping_dicts/txt_mapping_dicts.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/mapping_dicts/xml_mapping_dicts.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/package_updater.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/portfolio.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/portfolio_compression_utils.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/__init__.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/infrastructure/__init__.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/infrastructure/submissions_metadata.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/submissions/__init__.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/submissions/downloader.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/submissions/eftsquery.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/submissions/monitor.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/submissions/streamer.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/submissions/textsearch.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/utils.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/xbrl/__init__.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/xbrl/downloadcompanyfacts.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/xbrl/filter_xbrl.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/xbrl/streamcompanyfacts.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sec/xbrl/xbrlmonitor.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/seclibrary/__init__.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/seclibrary/bq.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/sheet.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/submission.py +0 -0
- {datamule-2.1.6/datamule/utils → datamule-2.2.1/datamule/tags}/__init__.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/utils/construct_submissions_data.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule/utils/format_accession.py +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule.egg-info/dependency_links.txt +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/datamule.egg-info/top_level.txt +0 -0
- {datamule-2.1.6 → datamule-2.2.1}/setup.cfg +0 -0
```diff
--- datamule-2.1.6/PKG-INFO
+++ datamule-2.2.1/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.1.6
+Version: 2.2.1
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -19,3 +19,4 @@ Requires-Dist: secxbrl
 Requires-Dist: secsgml
 Requires-Dist: websocket-client
 Requires-Dist: company_fundamentals
+Requires-Dist: flashtext
```
```diff
--- datamule-2.1.6/datamule/document/document.py
+++ datamule-2.2.1/datamule/document/document.py
@@ -13,9 +13,137 @@ from pathlib import Path
 import webbrowser
 from secsgml.utils import bytes_to_str
 import tempfile
-
+import warnings
 from .tables.tables import Tables
 
+from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex, get_all_tickers, get_full_names, get_full_names_dictionary_lookup
+
+
+class Tickers:
+    def __init__(self, document):
+        self.document = document
+        self._tickers_data = None
+
+    def _get_tickers_data(self):
+        """Get all tickers data once and cache it"""
+        if self._tickers_data is None:
+            # Check if document extension is supported
+            if self.document.extension not in ['.htm', '.html', '.txt']:
+                self._tickers_data = {}
+            else:
+                self._tickers_data = get_all_tickers(self.document.text)
+        return self._tickers_data
+
+    def __getattr__(self, exchange_name):
+        data = self._get_tickers_data()
+
+        if exchange_name in data:
+            return data[exchange_name]
+
+        return []
+
+    def __bool__(self):
+        """Return True if any tickers were found"""
+        data = self._get_tickers_data()
+        return bool(data.get('all', []))
+
+    def __repr__(self):
+        """Show the full ticker data when printed or accessed directly"""
+        data = self._get_tickers_data()
+        return str(data)
+
+    def __str__(self):
+        """Show the full ticker data when printed"""
+        data = self._get_tickers_data()
+        return str(data)
+
+class Tags:
+    def __init__(self, document):
+        from ..tags.config import _active_dictionaries, _loaded_dictionaries
+        self.not_supported = document.extension not in ['.htm', '.html', '.txt']
+        self.document = document
+        self._tickers = None
+        self.dictionaries = {}
+        self.processors = {}
+
+        # Load global dictionaries with their data and processors
+        active_dicts = _active_dictionaries
+        for dict_name in active_dicts:
+            dict_info = _loaded_dictionaries[dict_name]
+            self.dictionaries[dict_name] = dict_info['data']
+            if dict_info['processor'] is not None:
+                self.processors[dict_name] = dict_info['processor']
+
+
+    def _check_support(self):
+        if self.not_supported:
+            warnings.warn(f"Document extension '{self.document.extension}' is not supported. Supported formats: .htm, .html, .txt")
+            return False
+        return True
+
+    @property
+    def cusips(self):
+        if not self._check_support():
+            return None
+
+        if not hasattr(self, '_cusip'):
+            if 'sc13dg_cusips' in self.dictionaries:
+                keywords = self.dictionaries['sc13dg_cusips']
+                self._cusip = get_cusip_using_regex(self.document.text, keywords)
+            else:
+                self._cusip = get_cusip_using_regex(self.document.text)
+        return self._cusip
+
+    @property
+    def isins(self):
+        if not self._check_support():
+            return None
+
+        if not hasattr(self, '_isin'):
+            if 'npx_isins' in self.dictionaries:
+                keywords = self.dictionaries['npx_isins']
+                self._isin = get_isin_using_regex(self.document.text, keywords)
+            else:
+                self._isin = get_isin_using_regex(self.document.text)
+        return self._isin
+
+    @property
+    def figis(self):
+        if not self._check_support():
+            return None
+
+        if not hasattr(self, '_figi'):
+            if 'npx_figis' in self.dictionaries:
+                keywords = self.dictionaries['npx_figis']
+                self._figi = get_figi_using_regex(self.document.text, keywords)
+            else:
+                self._figi = get_figi_using_regex(self.document.text)
+        return self._figi
+
+    @property
+    def tickers(self):
+        if self._tickers is None:
+            self._tickers = Tickers(self.document)
+        return self._tickers
+
+    @property
+    def persons(self):
+        if not self._check_support():
+            return None
+
+        if not hasattr(self, '_persons'):
+            if '8k_2024_persons' in self.processors:
+                # Use pre-built processor
+                self._persons = get_full_names_dictionary_lookup(self.document.text, self.processors['8k_2024_persons'])
+            elif 'ssa_baby_first_names' in self.dictionaries:
+                # Use regex with SSA names for validation
+                self._persons = get_full_names(self.document.text, self.dictionaries['ssa_baby_first_names'])
+            else:
+                # Fallback to regex without validation
+                self._persons = get_full_names(self.document.text)
+        return self._persons
+
+
 class Document:
     def __init__(self, type, content, extension, accession, filing_date, path=None):
 
@@ -34,10 +162,13 @@ class Document:
         self.path = path
 
         self.extension = extension
+
         # this will be filled by parsed
         self._data = None
         self._tables = None
        self._text = None
+
+        self.tags = Tags(self)
 
 
 
```
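For orientation (not part of the diff): a minimal usage sketch of the new `Document.tags` attribute. The constructor signature and attribute names come from the hunks above; the filing text, accession number, and date are placeholders, and the sketch assumes `Document.text` decodes raw `.txt` content, which this diff does not show.

```python
from datamule.document.document import Document

doc = Document(
    type="SC 13D",
    content=b"Reporting person John Smith holds shares of Apple Inc. "
            b"CUSIP 037833100 trades as AAPL on NASDAQ.",
    extension=".txt",
    accession="0000000000-24-000000",  # placeholder accession number
    filing_date="2024-01-01",          # placeholder date
)

print(doc.tags.cusips)   # (match, start, end) tuples, e.g. ('037833100', ...)
print(doc.tags.persons)  # capitalized-run candidates, e.g. ('John Smith', ...)

# Tickers proxies exchange names to regex results; the patterns are loose,
# so 'nasdaq' returns AAPL but also any other 1-5 letter uppercase token.
print(doc.tags.tickers.nasdaq)
print(bool(doc.tags.tickers))  # True if any exchange pattern matched
```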
```diff
--- /dev/null
+++ datamule-2.2.1/datamule/tags/config.py
@@ -0,0 +1,33 @@
+from ..utils.dictionaries import download_dictionary, load_dictionary
+
+_active_dictionaries = []
+_loaded_dictionaries = {}
+
+def set_dictionaries(dictionaries, overwrite=False):
+    """Set active dictionaries and load them into memory"""
+    global _active_dictionaries, _loaded_dictionaries
+    _active_dictionaries = dictionaries
+    _loaded_dictionaries = {}
+
+    for dict_name in dictionaries:
+        # Download if needed
+        download_dictionary(dict_name, overwrite=overwrite)
+        # Load raw data
+        raw_data = load_dictionary(dict_name)
+
+        # Create processor for dictionary lookup methods
+        if dict_name in ['8k_2024_persons']:  # Add other dict names as needed
+            from flashtext import KeywordProcessor
+            processor = KeywordProcessor(case_sensitive=True)
+            for key in raw_data.keys():
+                processor.add_keyword(key, key)
+
+            _loaded_dictionaries[dict_name] = {
+                'data': raw_data,
+                'processor': processor
+            }
+        else:
+            _loaded_dictionaries[dict_name] = {
+                'data': raw_data,
+                'processor': None
+            }
```
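A short sketch of how the new `set_dictionaries` entry point appears intended to be used (the dictionary names are the keys of `urls` in `datamule/utils/dictionaries.py` below; the first call needs network access):

```python
from datamule.tags.config import set_dictionaries

# Downloads any missing lists to ~/.datamule/dictionaries, caches them in
# module-level state, and builds a flashtext KeywordProcessor for
# '8k_2024_persons'; Tags objects created afterwards pick all of this up.
set_dictionaries(['ssa_baby_first_names', 'sc13dg_cusips', '8k_2024_persons'])
```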
```diff
--- /dev/null
+++ datamule-2.2.1/datamule/tags/regex.py
@@ -0,0 +1,105 @@
+# Exchange ticker regexes with word boundaries
+nyse_regex = r"\b([A-Z]{1,4})(\.[A-Z]+)?\b"
+nasdaq_regex = r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"
+nyse_american_regex = r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"
+london_stock_exchange_regex = r"\b([A-Z]{3,4})(\.[A-Z]+)?\b"
+toronto_stock_exchange_regex = r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"
+euronext_paris_regex = r"\b([A-Z]{2,12})(\.[A-Z]+)?\b"
+euronext_amsterdam_regex = r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"
+euronext_brussels_regex = r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"
+euronext_lisbon_regex = r"\b([A-Z]{3,5})(\.[A-Z]+)?\b"
+euronext_milan_regex = r"\b([A-Z]{2,5})(\.[A-Z]+)?\b"
+deutsche_borse_xetra_regex = r"\b([A-Z0-9]{3,6})(\.[A-Z]+)?\b"
+six_swiss_exchange_regex = r"\b([A-Z]{2,6})(\.[A-Z]+)?\b"
+tokyo_stock_exchange_regex = r"\b(\d{4})\b"
+hong_kong_stock_exchange_regex = r"\b(\d{4,5})\b"
+shanghai_stock_exchange_regex = r"\b(6\d{5})\b"
+shenzhen_stock_exchange_regex = r"\b([03]\d{5})\b"
+australian_securities_exchange_regex = r"\b([A-Z]{3})(\.[A-Z]+)?\b"
+singapore_exchange_regex = r"\b([A-Z]\d{2}[A-Z]?)(\.[A-Z]+)?\b"
+nse_bse_regex = r"\b([A-Z&]{1,10})(\.[A-Z]+)?\b"
+sao_paulo_b3_regex = r"\b([A-Z]{4}\d{1,2})(\.[A-Z]+)?\b"
+mexico_bmv_regex = r"\b([A-Z*]{1,7})(\.[A-Z]+)?\b"
+korea_exchange_regex = r"\b(\d{6})\b"
+taiwan_stock_exchange_regex = r"\b(\d{4})\b"
+johannesburg_stock_exchange_regex = r"\b([A-Z]{3})(\.[A-Z]+)?\b"
+tel_aviv_stock_exchange_regex = r"\b([A-Z]{4})(\.[A-Z]+)?\b"
+moscow_exchange_regex = r"\b([A-Z]{4})(\.[A-Z]+)?\b"
+istanbul_stock_exchange_regex = r"\b([A-Z]{5})(\.[A-Z]+)?\b"
+nasdaq_stockholm_regex = r"\b([A-Z]{3,4})( [A-Z])?(\.[A-Z]+)?\b"
+oslo_bors_regex = r"\b([A-Z]{3,5})(\.[A-Z]+)?\b"
+otc_markets_us_regex = r"\b([A-Z]{4,5})[FY]?(\.[A-Z]+)?\b"
+pink_sheets_regex = r"\b([A-Z]{4,5})(\.[A-Z]+)?\b"
+
+ticker_regex_list = [
+    ("nyse", r"\b([A-Z]{1,4})(\.[A-Z]+)?\b"),
+    ("nasdaq", r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"),
+    ("nyse_american", r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"),
+    ("london_stock_exchange", r"\b([A-Z]{3,4})(\.[A-Z]+)?\b"),
+    ("toronto_stock_exchange", r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"),
+    ("euronext_paris", r"\b([A-Z]{2,12})(\.[A-Z]+)?\b"),
+    ("euronext_amsterdam", r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"),
+    ("euronext_brussels", r"\b([A-Z]{1,5})(\.[A-Z]+)?\b"),
+    ("euronext_lisbon", r"\b([A-Z]{3,5})(\.[A-Z]+)?\b"),
+    ("euronext_milan", r"\b([A-Z]{2,5})(\.[A-Z]+)?\b"),
+    ("deutsche_borse_xetra", r"\b([A-Z0-9]{3,6})(\.[A-Z]+)?\b"),
+    ("six_swiss_exchange", r"\b([A-Z]{2,6})(\.[A-Z]+)?\b"),
+    ("tokyo_stock_exchange", r"\b(\d{4})\b"),
+    ("hong_kong_stock_exchange", r"\b(\d{4,5})\b"),
+    ("shanghai_stock_exchange", r"\b(6\d{5})\b"),
+    ("shenzhen_stock_exchange", r"\b([03]\d{5})\b"),
+    ("australian_securities_exchange", r"\b([A-Z]{3})(\.[A-Z]+)?\b"),
+    ("singapore_exchange", r"\b([A-Z]\d{2}[A-Z]?)(\.[A-Z]+)?\b"),
+    ("nse_bse", r"\b([A-Z&]{1,10})(\.[A-Z]+)?\b"),
+    ("sao_paulo_b3", r"\b([A-Z]{4}\d{1,2})(\.[A-Z]+)?\b"),
+    ("mexico_bmv", r"\b([A-Z*]{1,7})(\.[A-Z]+)?\b"),
+    ("korea_exchange", r"\b(\d{6})\b"),
+    ("taiwan_stock_exchange", r"\b(\d{4})\b"),
+    ("johannesburg_stock_exchange", r"\b([A-Z]{3})(\.[A-Z]+)?\b"),
+    ("tel_aviv_stock_exchange", r"\b([A-Z]{4})(\.[A-Z]+)?\b"),
+    ("moscow_exchange", r"\b([A-Z]{4})(\.[A-Z]+)?\b"),
+    ("istanbul_stock_exchange", r"\b([A-Z]{5})(\.[A-Z]+)?\b"),
+    ("nasdaq_stockholm", r"\b([A-Z]{3,4})( [A-Z])?(\.[A-Z]+)?\b"),
+    ("oslo_bors", r"\b([A-Z]{3,5})(\.[A-Z]+)?\b"),
+    ("otc_markets_us", r"\b([A-Z]{4,5})[FY]?(\.[A-Z]+)?\b"),
+    ("pink_sheets", r"\b([A-Z]{4,5})(\.[A-Z]+)?\b"),
+]
+# Security identifier regexes with word boundaries
+cusip_regex = r"\b[0-9A-Z]{8}[0-9]\b"
+isin_regex = r"\b[A-Z]{2}[0-9A-Z]{9}[0-9]\b"
+figi_regex = r"\b[A-Z]{2}G[A-Z0-9]{8}[0-9]\b"
+
+particles = {
+    # Dutch - single words only
+    'van', 'der', 'den', 'de',
+
+    # German - single words only
+    'von', 'zu', 'vom', 'zur', 'zum',
+
+    # Spanish - single words only
+    'de', 'del', 'y',
+
+    # Portuguese - single words only
+    'da', 'das', 'do', 'dos', 'e',
+
+    # French - single words only
+    'de', 'du', 'des', 'le', 'la', 'les', "d'",
+
+    # Italian - single words only
+    'da', 'di', 'del', 'della', 'delle', 'dei', 'degli', 'dello',
+
+    # Irish/Scottish
+    'mac', 'mc', 'o',
+
+    # Arabic
+    'al', 'el', 'ibn', 'bin', 'bint', 'abu',
+
+    # Other European
+    'af', 'av',  # Scandinavian
+    'ter',  # Dutch/Flemish
+    'op',  # Dutch
+    'aan',  # Dutch
+    'ten',  # Dutch
+    'het',  # Dutch
+    'in',  # Dutch
+}
```
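The identifier patterns are strict enough to check in isolation; the exchange patterns, by contrast, are deliberately loose (any one-to-four-letter uppercase token matches the NYSE shape) and depend on downstream keyword validation. A standalone check against Apple's public identifiers:

```python
import re

# Same patterns as in regex.py above.
cusip_regex = r"\b[0-9A-Z]{8}[0-9]\b"
isin_regex = r"\b[A-Z]{2}[0-9A-Z]{9}[0-9]\b"

text = "Apple Inc. (CUSIP 037833100, ISIN US0378331005)"
print(re.findall(cusip_regex, text))  # ['037833100']
print(re.findall(isin_regex, text))   # ['US0378331005']
```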
```diff
--- /dev/null
+++ datamule-2.2.1/datamule/tags/utils.py
@@ -0,0 +1,145 @@
+import re
+from .regex import cusip_regex, isin_regex, figi_regex, ticker_regex_list
+from .regex import particles
+from flashtext import KeywordProcessor
+
+def get_cusip_using_regex(text, keywords=None):
+    matches = []
+    for match in re.finditer(cusip_regex, text):
+        if keywords is not None:
+            if match.group() in keywords:
+                matches.append((match.group(), match.start(), match.end()))
+        else:
+            matches.append((match.group(), match.start(), match.end()))
+    return matches
+
+def get_isin_using_regex(text, keywords=None):
+    matches = []
+    for match in re.finditer(isin_regex, text):
+        if keywords is not None:
+            if match.group() in keywords:
+                matches.append((match.group(), match.start(), match.end()))
+        else:
+            matches.append((match.group(), match.start(), match.end()))
+    return matches
+
+def get_figi_using_regex(text, keywords=None):
+    matches = []
+    for match in re.finditer(figi_regex, text):
+        if keywords is not None:
+            if match.group() in keywords:
+                matches.append((match.group(), match.start(), match.end()))
+        else:
+            matches.append((match.group(), match.start(), match.end()))
+    return matches
+
+def get_tickers_using_regex(text, regex_pattern):
+    """Extract tickers using the given regex pattern with position information"""
+    matches = []
+    for match in re.finditer(regex_pattern, text):
+        # Handle tuples from regex groups - take the first capture group
+        if match.groups():
+            ticker = match.group(1) if match.group(1) else match.group(0)
+        else:
+            ticker = match.group(0)
+        matches.append((ticker, match.start(), match.end()))
+    return matches
+
+def get_all_tickers(text):
+    """Get all tickers from all exchanges organized by exchange with position info"""
+    result = {}
+    all_tickers = []
+
+    for exchange_name, regex_pattern in ticker_regex_list:
+        tickers = get_tickers_using_regex(text, regex_pattern)
+        result[exchange_name] = tickers
+        all_tickers.extend(tickers)
+
+    # Remove duplicates while preserving order for 'all'
+    # Keep track of seen ticker values (first element of tuple)
+    seen = set()
+    result['all'] = [x for x in all_tickers if not (x[0] in seen or seen.add(x[0]))]
+
+    return result
+
+def get_ticker_regex_dict():
+    """Return ticker regex list as a dictionary for easy lookup"""
+    return dict(ticker_regex_list)
+
+# will change in future to accommodate other datasets
+def validate_full_name(full_name, keywords):
+    if len(full_name) == 1:
+        return False
+    # check all is upper
+    if all(word.isupper() for word in full_name):
+        return False
+    # check if any number in word
+    if any(any(char.isdigit() for char in word) for word in full_name):
+        return False
+    if any(any(char in ".,;:!?()[]" for char in word) for word in full_name):
+        return False
+
+    # add optional set lookups
+    if keywords is not None:
+        # return false if first word is not in keywords set
+        if full_name[0] not in keywords:
+            return False
+
+    return True
+
+def get_full_names(text, keywords=None):
+    words = text.split()
+    full_names = []
+    current_pos = None
+    word_start_positions = []
+
+    # Calculate word positions in the original text
+    pos = 0
+    for word in words:
+        start = text.find(word, pos)
+        word_start_positions.append(start)
+        pos = start + len(word)
+
+    for idx, word in enumerate(words):
+        if current_pos is None:
+            if word[0].isupper():
+                current_pos = idx
+        else:
+            if word[0].isupper() or word.lower() in particles:
+                continue
+            else:
+                full_name = words[current_pos:idx]
+                if validate_full_name(full_name, keywords):
+                    name_text = ' '.join(full_name)
+                    start_pos = word_start_positions[current_pos]
+                    # Calculate end position of the last word in the name
+                    last_word_idx = idx - 1
+                    end_pos = word_start_positions[last_word_idx] + len(words[last_word_idx])
+                    full_names.append((name_text, start_pos, end_pos))
+
+                current_pos = None
+
+    # handle last case - if we're still tracking a name when we reach the end
+    if current_pos is not None:
+        full_name = words[current_pos:]
+        if validate_full_name(full_name, keywords):
+            name_text = ' '.join(full_name)
+            start_pos = word_start_positions[current_pos]
+            # Calculate end position of the last word
+            last_word_idx = len(words) - 1
+            end_pos = word_start_positions[last_word_idx] + len(words[last_word_idx])
+            full_names.append((name_text, start_pos, end_pos))
+
+    return full_names
+
+# add dictionary lookup based on precomputed lists
+def get_full_names_dictionary_lookup(text, processor):
+    """Use pre-built KeywordProcessor instead of creating new one"""
+    matches = []
+    keywords_found = processor.extract_keywords(text, span_info=True)
+
+    for keyword, start_pos, end_pos in keywords_found:
+        matches.append((keyword, start_pos, end_pos))
+
+    return matches
```
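A standalone sketch of `get_full_names` above: it groups runs of capitalized words (lowercase particles such as 'de' or 'van' may continue a run) and rejects single words or runs containing digits or punctuation via `validate_full_name`:

```python
from datamule.tags.utils import get_full_names

text = "Director John Smith met with analysts before the vote"
print(get_full_names(text))
# -> [('Director John Smith', 0, 19)]: (name, start, end) offsets into text
```

Note that a run must start with a capitalized word, so a bare "de la Cruz" is missed, and when a `keywords` set is supplied the run's first word must appear in it.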
```diff
--- /dev/null
+++ datamule-2.2.1/datamule/utils/dictionaries.py
@@ -0,0 +1,76 @@
+from pathlib import Path
+import urllib.request
+import json
+urls = {
+    "ssa_baby_first_names": "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/ssa_baby_first_names.txt",
+    "npx_figis": "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_figis.txt",
+    "npx_isins": "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_isins.txt",
+    "sc13dg_cusips": "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/sc13dg_cusips.txt",
+    "8k_2024_persons": "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/8k_2024_persons.json"
+}
+
+
+def download_dictionary(name, overwrite=False):
+    url = urls[name]
+
+    # Create dictionaries directory in datamule folder
+    dict_dir = Path.home() / ".datamule" / "dictionaries"
+    dict_dir.mkdir(parents=True, exist_ok=True)
+
+    # check if file exists first
+    if not overwrite:
+        filename = url.split('/')[-1]
+        file_path = dict_dir / filename
+        if file_path.exists():
+            return
+
+    # Extract filename from URL
+    filename = url.split('/')[-1]
+    file_path = dict_dir / filename
+
+    print(f"Downloading {name} dictionary to {file_path}")
+    urllib.request.urlretrieve(url, file_path)
+    return
+
+def load_dictionary(name):
+    # Get or download the dictionary file
+    dict_dir = Path.home() / ".datamule" / "dictionaries"
+    filename = urls[name].split('/')[-1]
+    file_path = dict_dir / filename
+
+    # Download if doesn't exist
+    if not file_path.exists():
+        download_dictionary(name)
+
+    # Load the dictionary based on name
+    if name == "ssa_baby_first_names":
+        names_set = set()
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                names_set.add(line.strip())
+        return names_set
+    elif name == "npx_figis":
+        figi_set = set()
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                figi_set.add(line.strip())
+        return figi_set
+    elif name == "npx_isins":
+        isin_set = set()
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                isin_set.add(line.strip())
+        return isin_set
+    elif name == "sc13dg_cusips":
+        cusip_set = set()
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                cusip_set.add(line.strip())
+        return cusip_set
+    elif name == "8k_2024_persons":
+
+        with open(file_path, 'r', encoding='utf-8') as f:
+            persons_list = json.load(f)
+        return persons_list
+    else:
+        raise ValueError("dictionary not found")
```
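The helpers can also be driven directly; a minimal sketch (the first call downloads from the john-friedman/datamule-data GitHub repository):

```python
from datamule.utils.dictionaries import download_dictionary, load_dictionary

download_dictionary("sc13dg_cusips")       # no-op if already cached locally
cusips = load_dictionary("sc13dg_cusips")  # text lists load as sets of strings
print(len(cusips), "CUSIPs loaded")
```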
```diff
--- datamule-2.1.6/datamule.egg-info/PKG-INFO
+++ datamule-2.2.1/datamule.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.1.6
+Version: 2.2.1
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman
@@ -19,3 +19,4 @@ Requires-Dist: secxbrl
 Requires-Dist: secsgml
 Requires-Dist: websocket-client
 Requires-Dist: company_fundamentals
+Requires-Dist: flashtext
```
```diff
--- datamule-2.1.6/datamule.egg-info/SOURCES.txt
+++ datamule-2.2.1/datamule.egg-info/SOURCES.txt
@@ -54,6 +54,11 @@ datamule/sec/xbrl/streamcompanyfacts.py
 datamule/sec/xbrl/xbrlmonitor.py
 datamule/seclibrary/__init__.py
 datamule/seclibrary/bq.py
+datamule/tags/__init__.py
+datamule/tags/config.py
+datamule/tags/regex.py
+datamule/tags/utils.py
 datamule/utils/__init__.py
 datamule/utils/construct_submissions_data.py
+datamule/utils/dictionaries.py
 datamule/utils/format_accession.py
```
```diff
--- datamule-2.1.6/setup.py
+++ datamule-2.2.1/setup.py
@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="2.1.6",
+    version="2.2.1",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",
@@ -51,7 +51,8 @@ setup(
         'secxbrl',
         'secsgml',
         'websocket-client',
-        'company_fundamentals'
+        'company_fundamentals',
+        'flashtext'
     ],
     # Include the data directory in the package
     package_data={
```