PyPI - datamule - Versions diffs - 2.2.1__py3-none-any.whl → 2.2.3__py3-none-any.whl - Mend

datamule 2.2.1__py3-none-any.whl → 2.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

datamule/document/document.py +192 -61
datamule/sentiment/__init__.py +0 -0
datamule/tags/config.py +9 -1
datamule/{utils → tags}/dictionaries.py +48 -7
datamule/tags/utils.py +28 -1
{datamule-2.2.1.dist-info → datamule-2.2.3.dist-info}/METADATA +1 -1
{datamule-2.2.1.dist-info → datamule-2.2.3.dist-info}/RECORD +9 -8
{datamule-2.2.1.dist-info → datamule-2.2.3.dist-info}/WHEEL +0 -0
{datamule-2.2.1.dist-info → datamule-2.2.3.dist-info}/top_level.txt +0 -0

datamule/document/document.py CHANGED Viewed

@@ -13,11 +13,47 @@ from pathlib import Path
 import webbrowser
 from secsgml.utils import bytes_to_str
 import tempfile
-import warnings
 from .tables.tables import Tables
-from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup
+from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup, analyze_lm_sentiment_fragment
+class DataWithTags(dict):
+    def __init__(self, data, document):
+        super().__init__(data)
+        self._document = document
+        self._tags = None
+    @property
+    def tags(self):
+        if self._tags is None:
+            self._tags = Tags(self._document, mode='data')  # New fragment-based behavior
+        return self._tags
+    @property
+    def similarity(self):
+        if not hasattr(self, '_similarity'):
+            self._similarity = Similarity(self._document, mode='data')
+        return self._similarity
+class TextWithTags(str):
+    def __new__(cls, content, document):
+        instance = str.__new__(cls, content)
+        instance._document = document
+        instance._tags = None
+        return instance
+    @property
+    def tags(self):
+        if self._tags is None:
+            self._tags = Tags(self._document, mode='text')  # Original behavior
+        return self._tags
+    @property
+    def similarity(self):
+        if not hasattr(self, '_similarity'):
+            self._similarity = Similarity(self._document, mode='text')
+        return self._similarity
 class Tickers:
     def __init__(self, document):
@@ -27,11 +63,7 @@ class Tickers:
     def _get_tickers_data(self):
         """Get all tickers data once and cache it"""
         if self._tickers_data is None:
-            # Check if document extension is supported
-            if self.document.extension not in ['.htm', '.html', '.txt']:
-                self._tickers_data = {}
-            else:
-                self._tickers_data = get_all_tickers(self.document.text)
+           self._tickers_data = get_all_tickers(self.document.text)
         return self._tickers_data
     def __getattr__(self, exchange_name):
@@ -57,14 +89,14 @@ class Tickers:
         data = self._get_tickers_data()
         return str(data)
-class Tags:
-    def __init__(self, document):
+class TextAnalysisBase:
+    def __init__(self, document, mode='text'):
         from ..tags.config import _active_dictionaries,_loaded_dictionaries
-        self.not_supported = document.extension not in ['.htm', '.html', '.txt']
         self.document = document
-        self._tickers = None
+        self.mode = mode  # 'text' or 'data'
         self.dictionaries = {}
         self.processors = {}
+        self._text_sources = None
         # Load global dictionaries with their data and processors
         active_dicts = _active_dictionaries
@@ -73,76 +105,166 @@ class Tags:
             self.dictionaries[dict_name] = dict_info['data']
             if dict_info['processor'] is not None:
                 self.processors[dict_name] = dict_info['processor']
-    def _check_support(self):
-        if self.not_supported:
-            warnings.warn(f"Document extension '{self.document.extension}' is not supported. Supported formats: .htm, .html, .txt")
-            return False
-        return True
+    def _get_text_sources(self):
+        """Get text sources based on mode - either single text or multiple fragments"""
+        if self._text_sources is None:
+            if self.mode == 'text':
+                # Original behavior - single text source
+                self._text_sources = [{'id': None, 'text': str(self.document.text)}]
+            else:  # mode == 'data'
+                # New behavior - multiple text fragments
+                self._text_sources = []
+                self._extract_text_fragments(self.document.data, '')
+        return self._text_sources
+    def _extract_text_fragments(self, data, parent_id=''):
+        """Extract all text fragments with their document IDs from parsed data"""
+        if isinstance(data, dict):
+            for key, value in data.items():
+                if key in ["text", "title"] and isinstance(value, str):
+                    # Use the current dictionary's parent key as the fragment ID
+                    self._text_sources.append({
+                        'id': parent_id,
+                        'text': value
+                    })
+                elif isinstance(value, (dict, list)):
+                    # Pass the current key as the parent_id for the next level
+                    self._extract_text_fragments(value, key)
+        elif isinstance(data, list):
+            for i, item in enumerate(data):
+                if isinstance(item, (dict, list)):
+                    self._extract_text_fragments(item, parent_id)
+    def _format_results(self, results, fragment_id):
+        """Format results based on mode"""
+        if self.mode == 'text':
+            # Original format: (match, start, end)
+            return results
+        else:
+            # New format: (match, fragment_id, start, end)
+            return [(match, fragment_id, start, end) for match, start, end in results]
+class Tags(TextAnalysisBase):
+    def __init__(self, document, mode='text'):
+        super().__init__(document, mode)
+        self._tickers = None
     @property
     def cusips(self):
-        if not self._check_support():
-            return None
+        if not hasattr(self, '_cusips'):
+            self._cusips = []
+            sources = self._get_text_sources()
-        if not hasattr(self, '_cusip'):
-            if 'sc13dg_cusips' in self.dictionaries:
-                keywords = self.dictionaries['sc13dg_cusips']
-                self._cusip = get_cusip_using_regex(self.document.text, keywords)
-            else:
-                self._cusip = get_cusip_using_regex(self.document.text)
-        return self._cusip
+            for source in sources:
+                if 'sc13dg_cusips' in self.dictionaries:
+                    keywords = self.dictionaries['sc13dg_cusips']
+                    results = get_cusip_using_regex(source['text'], keywords)
+                elif "13fhr_information_table_cusips" in self.dictionaries:
+                    keywords = self.dictionaries['13fhr_information_table_cusips']
+                    results = get_cusip_using_regex(source['text'], keywords)
+                else:
+                    results = get_cusip_using_regex(source['text'])
+                # Format results based on mode
+                formatted_results = self._format_results(results, source['id'])
+                self._cusips.extend(formatted_results)
+        return self._cusips
     @property
     def isins(self):
-        if not self._check_support():
-            return None
+        if not hasattr(self, '_isins'):
+            self._isins = []
+            sources = self._get_text_sources()
-        if not hasattr(self, '_isin'):
-            if 'npx_isins' in self.dictionaries:
-                keywords = self.dictionaries['npx_isins']
-                self._isin = get_isin_using_regex(self.document.text, keywords)
-            else:
-                self._isin = get_isin_using_regex(self.document.text)
-        return self._isin
+            for source in sources:
+                if 'npx_isins' in self.dictionaries:
+                    keywords = self.dictionaries['npx_isins']
+                    results = get_isin_using_regex(source['text'], keywords)
+                else:
+                    results = get_isin_using_regex(source['text'])
+                formatted_results = self._format_results(results, source['id'])
+                self._isins.extend(formatted_results)
+        return self._isins
     @property
     def figis(self):
-        if not self._check_support():
-            return None
+        if not hasattr(self, '_figis'):
+            self._figis = []
+            sources = self._get_text_sources()
-        if not hasattr(self, '_figi'):
-            if 'npx_figis' in self.dictionaries:
-                keywords = self.dictionaries['npx_figis']
-                self._figi = get_figi_using_regex(self.document.text, keywords)
-            else:
-                self._figi = get_figi_using_regex(self.document.text)
-        return self._figi
+            for source in sources:
+                if 'npx_figis' in self.dictionaries:
+                    keywords = self.dictionaries['npx_figis']
+                    results = get_figi_using_regex(source['text'], keywords)
+                else:
+                    results = get_figi_using_regex(source['text'])
+                formatted_results = self._format_results(results, source['id'])
+                self._figis.extend(formatted_results)
+        return self._figis
     @property
     def tickers(self):
+        # Tickers work differently - they need the full document context
+        # Keep original behavior for now
         if self._tickers is None:
             self._tickers = Tickers(self.document)
         return self._tickers
     @property
     def persons(self):
-        if not self._check_support():
-            return None
         if not hasattr(self, '_persons'):
-            if '8k_2024_persons' in self.processors:
-                # Use pre-built processor
-                self._persons = get_full_names_dictionary_lookup(self.document.text, self.processors['8k_2024_persons'])
-            elif 'ssa_baby_first_names' in self.dictionaries:
-                # Use regex with SSA names for validation
-                self._persons = get_full_names(self.document.text, self.dictionaries['ssa_baby_first_names'])
-            else:
-                # Fallback to regex without validation
-                self._persons = get_full_names(self.document.text)
+            self._persons = []
+            sources = self._get_text_sources()
+            for source in sources:
+                if '8k_2024_persons' in self.processors:
+                    results = get_full_names_dictionary_lookup(source['text'], self.processors['8k_2024_persons'])
+                elif 'ssa_baby_first_names' in self.dictionaries:
+                    results = get_full_names(source['text'], self.dictionaries['ssa_baby_first_names'])
+                else:
+                    results = get_full_names(source['text'])
+                formatted_results = self._format_results(results, source['id'])
+                self._persons.extend(formatted_results)
         return self._persons
+class Similarity(TextAnalysisBase):
+    @property
+    def loughran_mcdonald(self):
+        if not hasattr(self, '_loughran_mcdonald'):
+            self._loughran_mcdonald = []
+            sources = self._get_text_sources()
+            if 'loughran_mcdonald' in self.processors:
+                lm_processors = self.processors['loughran_mcdonald']
+                for source in sources:
+                    results = analyze_lm_sentiment_fragment(source['text'], lm_processors)
+                    if self.mode == 'text':
+                        # Single result for whole document
+                        self._loughran_mcdonald = results
+                        break
+                    else:
+                        # Per-fragment results with fragment_id
+                        fragment_result = {
+                            'fragment_id': source['id'],
+                            **results
+                        }
+                        self._loughran_mcdonald.append(fragment_result)
+            else:
+                # No processors available
+                self._loughran_mcdonald = [] if self.mode == 'data' else {}
+        return self._loughran_mcdonald
 class Document:
     def __init__(self, type, content, extension,accession,filing_date,path=None):
@@ -168,8 +290,6 @@ class Document:
         self._tables = None
         self._text = None
-        self.tags = Tags(self)
     #_load_text_content
@@ -354,15 +474,26 @@ class Document:
     def data(self):
         if self._data is None:
             self.parse()
+        if self._data is None:
+            self._data = {}
+        if not isinstance(self._data, DataWithTags):
+            self._data = DataWithTags(self._data, self)
         return self._data
     @property
     def text(self):
         if self._text is None:
             if self.extension in ['.htm','.html']:
-                self._preprocess_html_content()
+                self._preprocess_html_content()  # Still sets self._text to plain string
             elif self.extension == '.txt':
-                self._preprocess_txt_content()
+                self._preprocess_txt_content()   # Still sets self._text to plain string
+            # Convert the plain string to TextWithTags
+            plain_text = self._text
+            self._text = TextWithTags(plain_text, self)
         return self._text
     def write_json(self, output_filename=None):

datamule/sentiment/__init__.py ADDED Viewed

File without changes

datamule/tags/config.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from ..utils.dictionaries import download_dictionary, load_dictionary
+from .dictionaries import download_dictionary, load_dictionary
 _active_dictionaries = []
 _loaded_dictionaries = {}
@@ -26,6 +26,14 @@ def set_dictionaries(dictionaries, overwrite=False):
                 'data': raw_data,
                 'processor': processor
             }
+        elif dict_name == 'loughran_mcdonald':
+            from .utils import create_lm_processors
+            processors = create_lm_processors(raw_data)
+            _loaded_dictionaries[dict_name] = {
+                'data': raw_data,
+                'processor': processors
+            }
         else:
             _loaded_dictionaries[dict_name] = {
                 'data': raw_data,

datamule/{utils → tags}/dictionaries.py RENAMED Viewed

@@ -1,16 +1,19 @@
 from pathlib import Path
 import urllib.request
 import json
+import csv
 urls = {
     "ssa_baby_first_names": "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/ssa_baby_first_names.txt",
     "npx_figis" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_figis.txt",
     "npx_isins" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_isins.txt",
     "sc13dg_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/sc13dg_cusips.txt",
-    "8k_2024_persons" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/8k_2024_persons.json"
+    "8k_2024_persons" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/8k_2024_persons.json",
+    "13fhr_information_table_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/refs/heads/master/data/dictionaries/13fhr_information_table_cusips.txt",
+    "loughran_mcdonald" : "https://drive.usercontent.google.com/u/0/uc?id=1cfg_w3USlRFS97wo7XQmYnuzhpmzboAY&export=download"
 }
-def download_dictionary(name,overwrite=False):
+def download_dictionary(name, overwrite=False):
     url = urls[name]
     # Create dictionaries directory in datamule folder
@@ -19,13 +22,19 @@ def download_dictionary(name,overwrite=False):
     # check if file exists first
     if not overwrite:
-        filename = url.split('/')[-1]
+        if name == "loughran_mcdonald":
+            filename = "loughran_mcdonald.csv"
+        else:
+            filename = url.split('/')[-1]
         file_path = dict_dir / filename
         if file_path.exists():
             return
     # Extract filename from URL
-    filename = url.split('/')[-1]
+    if name == "loughran_mcdonald":
+        filename = "loughran_mcdonald.csv"
+    else:
+        filename = url.split('/')[-1]
     file_path = dict_dir / filename
     print(f"Downloading {name} dictionary to {file_path}")
@@ -35,7 +44,11 @@ def download_dictionary(name,overwrite=False):
 def load_dictionary(name):
     # Get or download the dictionary file
     dict_dir = Path.home() / ".datamule" / "dictionaries"
-    filename = urls[name].split('/')[-1]
+    if name == "loughran_mcdonald":
+        filename = "loughran_mcdonald.csv"
+    else:
+        filename = urls[name].split('/')[-1]
     file_path = dict_dir / filename
     # Download if doesn't exist
@@ -67,10 +80,38 @@ def load_dictionary(name):
             for line in f:
                 cusip_set.add(line.strip())
         return cusip_set
+    elif name == "13fhr_information_table_cusips":
+        cusip_set = set()
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                cusip_set.add(line.strip())
+        return cusip_set
     elif name == "8k_2024_persons":
         with open(file_path, 'r', encoding='utf-8') as f:
             persons_list = json.load(f)
         return persons_list
+    elif name == "loughran_mcdonald":
+        # Load the Loughran-McDonald dictionary using base Python CSV
+        lm_dict = {}
+        categories = ['Negative', 'Positive', 'Uncertainty', 'Litigious',
+                    'Strong_Modal', 'Weak_Modal', 'Constraining']
+        # Initialize category sets
+        for category in categories:
+            lm_dict[category.lower()] = set()
+        with open(file_path, 'r', encoding='utf-8') as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                word = row['Word'].lower()
+                for category in categories:
+                    value = row.get(category)
+                    # Check if value exists and is not 0 (words added in specific years)
+                    if value and str(value).strip() != '0':
+                        lm_dict[category.lower()].add(word)
+        return lm_dict
     else:
-        raise ValueError("dictionary not found")
+        raise ValueError("dictionary not found")
+download_dictionary('loughran_mcdonald')

datamule/tags/utils.py CHANGED Viewed

@@ -142,4 +142,31 @@ def get_full_names_dictionary_lookup(text, processor):
     for keyword, start_pos, end_pos in keywords_found:
         matches.append((keyword, start_pos, end_pos))
-    return matches
+    return matches
+def create_lm_processors(lm_dict):
+    processors = {}
+    for category_key, word_set in lm_dict.items():
+        processor = KeywordProcessor(case_sensitive=False)
+        for word in word_set:
+            processor.add_keyword(word)
+        processors[category_key] = processor
+    return processors
+def analyze_lm_sentiment_fragment(text, processors):
+    """Analyze sentiment for a single text fragment"""
+    if not text or not text.strip():
+        return {}
+    word_count = len(text.split())
+    results = {}
+    for category, processor in processors.items():
+        matches = processor.extract_keywords(text.lower(), span_info=True)
+        results[category] = len(matches)
+    results['total_words'] = word_count
+    return results

{datamule-2.2.1.dist-info → datamule-2.2.3.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.2.1
+Version: 2.2.3
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman

{datamule-2.2.1.dist-info → datamule-2.2.3.dist-info}/RECORD RENAMED Viewed

@@ -15,7 +15,7 @@ datamule/datamule/datamule_mysql_rds.py,sha256=Q6_h24-SNECWK60RnM6UQjUIp5dhJmfn3
 datamule/datamule/downloader.py,sha256=mVg1SApfij_9-dTpcm_YB26Bxc_Yq1FR8xv2k50MHqU,18579
 datamule/datamule/sec_connector.py,sha256=VwOaODpHoAWy8JIky6kLR1-orW_PB61RHw7pIGRpkow,3288
 datamule/document/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/document/document.py,sha256=yiev4AYewjp8bPjWn9cuL43N2O11s9WUo4X2e7WUgiY,20628
+datamule/document/document.py,sha256=oOib-bFPZ0rsIk8WBgBVY73CwuU18MZDmXnAQ8fTVD8,26124
 datamule/document/tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/document/tables/tables.py,sha256=8riSAof6o-Gxoo0SkiQAE61fw8NmzDnEhJe6dATzmvA,4487
 datamule/document/tables/tables_13fhr.py,sha256=-6tWcaTyNsb0XuW0WMBrYir9Zn1wLZL0laKxRYfPNyg,4265
@@ -48,15 +48,16 @@ datamule/sec/xbrl/streamcompanyfacts.py,sha256=Qq88PqW5_j1k3Aqrl0KRmKeF54D6Wbb6H
 datamule/sec/xbrl/xbrlmonitor.py,sha256=TKFVfSyyUUfUgFQw4WxEVs4g8Nh-2C0tygNIRmTqW3Y,5848
 datamule/seclibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/seclibrary/bq.py,sha256=C8sb_rpXTvchprrFLcbRar4Qi0XWW25tnv1YsHSS5o4,18025
+datamule/sentiment/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/tags/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datamule/tags/config.py,sha256=RCYRw_voP2MrEx_iN7zjJiZ8YDa4QlzKPGpW5ZTij6U,1197
+datamule/tags/config.py,sha256=w7386pyvnWYPNwgMVT_Nw5ivXibOeFuSuMEI7lRsGrk,1495
+datamule/tags/dictionaries.py,sha256=1v2OoN1KnM3HbFHxATxe7LhVRoXe64ecRRgA3oak210,4587
 datamule/tags/regex.py,sha256=Zr1dlnb8OfecDkI2DFCI8DUBr9LI50fapQyBAYNEZrg,4487
-datamule/tags/utils.py,sha256=hQpQBVAJPmys1UKVS2mqc8Z5-qO_zma5ecFXvW9DXoo,5329
+datamule/tags/utils.py,sha256=hexmz_3YnoPrC98A5DTz1xa8o58xZ1yKbzQYP1XiQts,6100
 datamule/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datamule/utils/construct_submissions_data.py,sha256=NB_hvfxlRXPyt4Fgc-5qA8vJRItkLhBedCSTaxwW7Jg,5887
-datamule/utils/dictionaries.py,sha256=VImvQWlP8IohB76rDd83bZcT184LBOpOaXPOH46fA6Y,2795
 datamule/utils/format_accession.py,sha256=60RtqoNqoT9zSKVb1DeOv1gncJxzPTFMNW4SNOVmC_g,476
-datamule-2.2.1.dist-info/METADATA,sha256=aINGZMWV34SclEt-2Ij2d2848PJA7cLF6ZoBL2LwpfY,585
-datamule-2.2.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-datamule-2.2.1.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
-datamule-2.2.1.dist-info/RECORD,,
+datamule-2.2.3.dist-info/METADATA,sha256=cca85xqYigHxQbSRJPlOwyJ6pbVp-87YYk0wUBXcMr8,585
+datamule-2.2.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+datamule-2.2.3.dist-info/top_level.txt,sha256=iOfgmtSMFVyr7JGl_bYSTDry79JbmsG4p8zKq89ktKk,9
+datamule-2.2.3.dist-info/RECORD,,

{datamule-2.2.1.dist-info → datamule-2.2.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{datamule-2.2.1.dist-info → datamule-2.2.3.dist-info}/top_level.txt RENAMED Viewed

File without changes

datamule 2.2.1__py3-none-any.whl → 2.2.3__py3-none-any.whl

datamule 2.2.1__py3-none-any.whl → 2.2.3__py3-none-any.whl