PyPI - datamule - Versions diffs - 2.2.2__tar.gz → 2.2.4__tar.gz - Mend

datamule 2.2.2.tar.gz → 2.2.4.tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

{datamule-2.2.2 → datamule-2.2.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.2.2
+Version: 2.2.4
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman

{datamule-2.2.2 → datamule-2.2.4}/datamule/datamule/downloader.py RENAMED Viewed

@@ -287,7 +287,7 @@ class Downloader:
                     keepalive_timeout=60
                 )
-                async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=30)) as session:
+                async with aiohttp.ClientSession(connector=connector, timeout=aiohttp.ClientTimeout(total=600)) as session:
                     tasks = [
                         self.download_and_process(
                             session, url, semaphore, decompression_pool,

{datamule-2.2.2 → datamule-2.2.4}/datamule/document/document.py RENAMED Viewed

@@ -15,7 +15,7 @@ from secsgml.utils import bytes_to_str
 import tempfile
 from .tables.tables import Tables
-from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup
+from ..tags.utils import get_cusip_using_regex, get_isin_using_regex, get_figi_using_regex,get_all_tickers, get_full_names,get_full_names_dictionary_lookup, analyze_lm_sentiment_fragment
 class DataWithTags(dict):
     def __init__(self, data, document):
@@ -29,6 +29,12 @@ class DataWithTags(dict):
             self._tags = Tags(self._document, mode='data')  # New fragment-based behavior
         return self._tags
+    @property
+    def similarity(self):
+        if not hasattr(self, '_similarity'):
+            self._similarity = Similarity(self._document, mode='data')
+        return self._similarity
 class TextWithTags(str):
     def __new__(cls, content, document):
         instance = str.__new__(cls, content)
@@ -42,6 +48,12 @@ class TextWithTags(str):
             self._tags = Tags(self._document, mode='text')  # Original behavior
         return self._tags
+    @property
+    def similarity(self):
+        if not hasattr(self, '_similarity'):
+            self._similarity = Similarity(self._document, mode='text')
+        return self._similarity
 class Tickers:
     def __init__(self, document):
@@ -77,12 +89,11 @@ class Tickers:
         data = self._get_tickers_data()
         return str(data)
-class Tags:
+class TextAnalysisBase:
     def __init__(self, document, mode='text'):
         from ..tags.config import _active_dictionaries,_loaded_dictionaries
         self.document = document
         self.mode = mode  # 'text' or 'data'
-        self._tickers = None
         self.dictionaries = {}
         self.processors = {}
         self._text_sources = None
@@ -133,6 +144,11 @@ class Tags:
         else:
             # New format: (match, fragment_id, start, end)
             return [(match, fragment_id, start, end) for match, start, end in results]
+class Tags(TextAnalysisBase):
+    def __init__(self, document, mode='text'):
+        super().__init__(document, mode)
+        self._tickers = None
     @property
     def cusips(self):
@@ -218,7 +234,38 @@ class Tags:
                 self._persons.extend(formatted_results)
         return self._persons
+class Similarity(TextAnalysisBase):
+    @property
+    def loughran_mcdonald(self):
+        if not hasattr(self, '_loughran_mcdonald'):
+            self._loughran_mcdonald = []
+            sources = self._get_text_sources()
+            if 'loughran_mcdonald' in self.processors:
+                lm_processors = self.processors['loughran_mcdonald']
+                for source in sources:
+                    results = analyze_lm_sentiment_fragment(source['text'], lm_processors)
+                    if self.mode == 'text':
+                        # Single result for whole document
+                        self._loughran_mcdonald = results
+                        break
+                    else:
+                        # Per-fragment results with fragment_id
+                        fragment_result = {
+                            'fragment_id': source['id'],
+                            **results
+                        }
+                        self._loughran_mcdonald.append(fragment_result)
+            else:
+                # No processors available
+                self._loughran_mcdonald = [] if self.mode == 'data' else {}
+        return self._loughran_mcdonald
 class Document:
     def __init__(self, type, content, extension,accession,filing_date,path=None):

{datamule-2.2.2 → datamule-2.2.4}/datamule/tags/config.py RENAMED Viewed

@@ -1,4 +1,4 @@
-from ..utils.dictionaries import download_dictionary, load_dictionary
+from .dictionaries import download_dictionary, load_dictionary
 _active_dictionaries = []
 _loaded_dictionaries = {}
@@ -26,6 +26,14 @@ def set_dictionaries(dictionaries, overwrite=False):
                 'data': raw_data,
                 'processor': processor
             }
+        elif dict_name == 'loughran_mcdonald':
+            from .utils import create_lm_processors
+            processors = create_lm_processors(raw_data)
+            _loaded_dictionaries[dict_name] = {
+                'data': raw_data,
+                'processor': processors
+            }
         else:
             _loaded_dictionaries[dict_name] = {
                 'data': raw_data,

{datamule-2.2.2/datamule/utils → datamule-2.2.4/datamule/tags}/dictionaries.py RENAMED Viewed

@@ -1,17 +1,19 @@
 from pathlib import Path
 import urllib.request
 import json
+import csv
 urls = {
     "ssa_baby_first_names": "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/ssa_baby_first_names.txt",
     "npx_figis" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_figis.txt",
     "npx_isins" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/npx_isins.txt",
     "sc13dg_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/sc13dg_cusips.txt",
     "8k_2024_persons" : "https://raw.githubusercontent.com/john-friedman/datamule-data/master/data/dictionaries/8k_2024_persons.json",
-    "13fhr_information_table_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/refs/heads/master/data/dictionaries/13fhr_information_table_cusips.txt"
+    "13fhr_information_table_cusips" : "https://raw.githubusercontent.com/john-friedman/datamule-data/refs/heads/master/data/dictionaries/13fhr_information_table_cusips.txt",
+    "loughran_mcdonald" : "https://drive.usercontent.google.com/u/0/uc?id=1cfg_w3USlRFS97wo7XQmYnuzhpmzboAY&export=download"
 }
-def download_dictionary(name,overwrite=False):
+def download_dictionary(name, overwrite=False):
     url = urls[name]
     # Create dictionaries directory in datamule folder
@@ -20,13 +22,19 @@ def download_dictionary(name,overwrite=False):
     # check if file exists first
     if not overwrite:
-        filename = url.split('/')[-1]
+        if name == "loughran_mcdonald":
+            filename = "loughran_mcdonald.csv"
+        else:
+            filename = url.split('/')[-1]
         file_path = dict_dir / filename
         if file_path.exists():
             return
     # Extract filename from URL
-    filename = url.split('/')[-1]
+    if name == "loughran_mcdonald":
+        filename = "loughran_mcdonald.csv"
+    else:
+        filename = url.split('/')[-1]
     file_path = dict_dir / filename
     print(f"Downloading {name} dictionary to {file_path}")
@@ -36,7 +44,11 @@ def download_dictionary(name,overwrite=False):
 def load_dictionary(name):
     # Get or download the dictionary file
     dict_dir = Path.home() / ".datamule" / "dictionaries"
-    filename = urls[name].split('/')[-1]
+    if name == "loughran_mcdonald":
+        filename = "loughran_mcdonald.csv"
+    else:
+        filename = urls[name].split('/')[-1]
     file_path = dict_dir / filename
     # Download if doesn't exist
@@ -75,9 +87,31 @@ def load_dictionary(name):
                 cusip_set.add(line.strip())
         return cusip_set
     elif name == "8k_2024_persons":
         with open(file_path, 'r', encoding='utf-8') as f:
             persons_list = json.load(f)
         return persons_list
+    elif name == "loughran_mcdonald":
+        # Load the Loughran-McDonald dictionary using base Python CSV
+        lm_dict = {}
+        categories = ['Negative', 'Positive', 'Uncertainty', 'Litigious',
+                    'Strong_Modal', 'Weak_Modal', 'Constraining']
+        # Initialize category sets
+        for category in categories:
+            lm_dict[category.lower()] = set()
+        with open(file_path, 'r', encoding='utf-8') as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                word = row['Word'].lower()
+                for category in categories:
+                    value = row.get(category)
+                    # Check if value exists and is not 0 (words added in specific years)
+                    if value and str(value).strip() != '0':
+                        lm_dict[category.lower()].add(word)
+        return lm_dict
     else:
-        raise ValueError("dictionary not found")
+        raise ValueError("dictionary not found")
+download_dictionary('loughran_mcdonald')

{datamule-2.2.2 → datamule-2.2.4}/datamule/tags/utils.py RENAMED Viewed

@@ -142,4 +142,31 @@ def get_full_names_dictionary_lookup(text, processor):
     for keyword, start_pos, end_pos in keywords_found:
         matches.append((keyword, start_pos, end_pos))
-    return matches
+    return matches
+def create_lm_processors(lm_dict):
+    processors = {}
+    for category_key, word_set in lm_dict.items():
+        processor = KeywordProcessor(case_sensitive=False)
+        for word in word_set:
+            processor.add_keyword(word)
+        processors[category_key] = processor
+    return processors
+def analyze_lm_sentiment_fragment(text, processors):
+    """Analyze sentiment for a single text fragment"""
+    if not text or not text.strip():
+        return {}
+    word_count = len(text.split())
+    results = {}
+    for category, processor in processors.items():
+        matches = processor.extract_keywords(text.lower(), span_info=True)
+        results[category] = len(matches)
+    results['total_words'] = word_count
+    return results

datamule-2.2.4/datamule/utils/__init__.py ADDED Viewed

File without changes

{datamule-2.2.2 → datamule-2.2.4}/datamule.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datamule
-Version: 2.2.2
+Version: 2.2.4
 Summary: Work with SEC submissions at scale.
 Home-page: https://github.com/john-friedman/datamule-python
 Author: John Friedman

{datamule-2.2.2 → datamule-2.2.4}/datamule.egg-info/SOURCES.txt RENAMED Viewed

@@ -54,11 +54,12 @@ datamule/sec/xbrl/streamcompanyfacts.py
 datamule/sec/xbrl/xbrlmonitor.py
 datamule/seclibrary/__init__.py
 datamule/seclibrary/bq.py
+datamule/sentiment/__init__.py
 datamule/tags/__init__.py
 datamule/tags/config.py
+datamule/tags/dictionaries.py
 datamule/tags/regex.py
 datamule/tags/utils.py
 datamule/utils/__init__.py
 datamule/utils/construct_submissions_data.py
-datamule/utils/dictionaries.py
 datamule/utils/format_accession.py

{datamule-2.2.2 → datamule-2.2.4}/setup.py RENAMED Viewed

@@ -32,7 +32,7 @@ if not os.path.exists(file_path):
 setup(
     name="datamule",
     author="John Friedman",
-    version="2.2.2",
+    version="2.2.4",
     description="Work with SEC submissions at scale.",
     packages=find_packages(include=['datamule', 'datamule.*']),
     url="https://github.com/john-friedman/datamule-python",