PyPI - imbed_data_prep - Versions diffs - 0.1.1__py3-none-any.whl - Mend

imbed_data_prep 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54) hide show

imbed_data_prep/__init__.py +12 -0
imbed_data_prep/arxiv/README.md +31 -0
imbed_data_prep/arxiv/__init__.py +10 -0
imbed_data_prep/embeddings_of_aggregations/README.md +46 -0
imbed_data_prep/embeddings_of_aggregations/__init__.py +333 -0
imbed_data_prep/embeddings_of_aggregations/embeddings_and_order.ipynb +5573 -0
imbed_data_prep/epstein_files/README.md +182 -0
imbed_data_prep/epstein_files/__init__.py +1061 -0
imbed_data_prep/epstein_files/epstein_files.ipynb +2071 -0
imbed_data_prep/epstein_files/epstein_files_tables_info.json +1 -0
imbed_data_prep/epstein_files/epstein_files_tables_info.pickle +0 -0
imbed_data_prep/eurovis/README.md +52 -0
imbed_data_prep/eurovis/__init__.py +146 -0
imbed_data_prep/eurovis/eurovis.ipynb +3345 -0
imbed_data_prep/github_repos/README.md +45 -0
imbed_data_prep/github_repos/__init__.py +190 -0
imbed_data_prep/github_repos/github_repos.ipynb +840 -0
imbed_data_prep/hcp/README.md +48 -0
imbed_data_prep/hcp/__init__.py +253 -0
imbed_data_prep/hcp/hcp_analysis.ipynb +3886 -0
imbed_data_prep/jersey_laws/README.md +45 -0
imbed_data_prep/jersey_laws/__init__.py +85 -0
imbed_data_prep/jersey_laws/jersey_laws.ipynb +509 -0
imbed_data_prep/lmsys_ai_conversations/README.md +57 -0
imbed_data_prep/lmsys_ai_conversations/__init__.py +786 -0
imbed_data_prep/mcdonalds_reviews/README.md +51 -0
imbed_data_prep/mcdonalds_reviews/__init__.py +463 -0
imbed_data_prep/mcdonalds_reviews/mcdonalds_reviews_dacc.ipynb +1240 -0
imbed_data_prep/prompt_injections/README.md +44 -0
imbed_data_prep/prompt_injections/__init__.py +63 -0
imbed_data_prep/prompt_injections/prompt_injection_w_umap_embeddings.tsv +691 -0
imbed_data_prep/trump_vs_zelenskyy/README.md +60 -0
imbed_data_prep/trump_vs_zelenskyy/__init__.py +569 -0
imbed_data_prep/trump_vs_zelenskyy/trump_vs_zelensky.md +448 -0
imbed_data_prep/trump_vs_zelenskyy/trump_vs_zelenskyy.ipynb +3363 -0
imbed_data_prep/trump_vs_zelenskyy/trump_vs_zelenskyy_embeddings.parquet +0 -0
imbed_data_prep/trump_vs_zelenskyy/trump_vs_zelenskyy_transcript.parquet +0 -0
imbed_data_prep/twitter_sentiment/README.md +47 -0
imbed_data_prep/twitter_sentiment/__init__.py +174 -0
imbed_data_prep/twitter_sentiment/twitter_sentiment.ipynb +616 -0
imbed_data_prep/ultra_chat/README.md +37 -0
imbed_data_prep/ultra_chat/__init__.py +10 -0
imbed_data_prep/ultra_chat/ultra_chat.ipynb +229 -0
imbed_data_prep/wildchat/README.md +54 -0
imbed_data_prep/wildchat/__init__.py +265 -0
imbed_data_prep/wildchat/wildchat.ipynb +7787 -0
imbed_data_prep/wordnet_words/README.md +77 -0
imbed_data_prep/wordnet_words/__init__.py +1212 -0
imbed_data_prep/wordnet_words/test_synset_refactor.ipynb +229 -0
imbed_data_prep/wordnet_words/wordnet_words.ipynb +4341 -0
imbed_data_prep-0.1.1.dist-info/METADATA +41 -0
imbed_data_prep-0.1.1.dist-info/RECORD +54 -0
imbed_data_prep-0.1.1.dist-info/WHEEL +4 -0
imbed_data_prep-0.1.1.dist-info/licenses/LICENSE +21 -0

imbed_data_prep/__init__.py ADDED Viewed

@@ -0,0 +1,12 @@
+"""Modules to acquire and prepare data for the imbed package."""
+# # Use __getitem__ to protect access of the modules of list_of_modules
+# def __getitem__(name):
+#     # if name in list_of_modules:
+#     try:
+#         return globals()[name]
+#     except KeyError:
+#         pass  # will raise ImportError below
+#     raise ImportError(f"No module named {name}")

imbed_data_prep/arxiv/README.md ADDED Viewed

@@ -0,0 +1,31 @@
+# ArXiv
+Data preparation for ArXiv papers.
+## Status
+This module has been migrated to the standalone
+[`xv`](https://pypi.org/project/xv/) package on PyPI.
+The module here is a thin wrapper that re-exports from `xv`.
+## Data source
+[ArXiv](https://arxiv.org/) is an open-access repository of scientific
+papers in physics, mathematics, computer science, and related fields, hosted
+by Cornell University.
+## Usage
+```bash
+pip install xv
+```
+```python
+from imbed_data_prep.arxiv import ...  # delegates to xv
+```
+## Files in this directory
+| File | Description |
+|---|---|
+| `__init__.py` | Wrapper module importing from the `xv` package |

imbed_data_prep/arxiv/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+"""Access to ArXiv data.
+Project moved to xv: https://pypi.org/project/xv/
+"""
+from contextlib import suppress
+with suppress(ImportError):
+    from xv.data_access import *  # pip install xv

imbed_data_prep/embeddings_of_aggregations/README.md ADDED Viewed

@@ -0,0 +1,46 @@
+# Embeddings of Aggregations
+Experiments with aggregated embeddings over citation graphs.
+## Data source
+This module works with citation graph data and academic paper metadata.
+It takes a set of nodes (papers) with embeddings and citation links, then
+explores how aggregating the titles of cited papers and embedding those
+aggregated strings compares to the original embeddings.
+The data is expected to come from an external citation graph (e.g. Semantic
+Scholar, OpenAlex, or a custom corpus) loaded as DataFrames with paper IDs,
+titles, and citation edges.
+## What it does
+1. **Sample nodes** from a citation graph.
+2. **Permute citations** -- for each citing paper, generate multiple random
+   orderings of its cited papers.
+3. **Aggregate titles** -- concatenate the citing paper's title followed by
+   the cited-paper titles (in each permutation order) into a single string.
+4. **Embed aggregated strings** -- compute embeddings of these concatenated
+   title strings.
+5. **Compare** -- measure how the aggregated embeddings relate to the
+   original paper embeddings, exploring whether citation context captures
+   similar semantic information.
+## Output
+A DataFrame with columns:
+| Column | Description |
+|---|---|
+| `citing_id` | ID of the citing paper |
+| `n_cited` | Number of papers cited |
+| `permutation_index` | Index of this particular citation ordering |
+| `aggregated_title` | Citing-paper title followed by the cited-paper titles, concatenated in this permutation's order |
+| `embedding` | Embedding vector of the aggregated title string |
+## Files in this directory
+| File | Description |
+|---|---|
+| `__init__.py` | Module code |
+| `embeddings_and_order.ipynb` | Notebook exploring aggregation experiments |

imbed_data_prep/embeddings_of_aggregations/__init__.py ADDED Viewed

@@ -0,0 +1,333 @@
+"""Tools to analyze the embeddings of aggregations"""
+import numpy as np
+from dol import Pipe
+import pandas as pd
+from typing import List, TypeVar, Tuple
+from collections.abc import Mapping, Callable, Iterable
+import oa
+# `simple_semantic_features` was moved/renamed in `imbed`; the current equivalent
+# is `three_text_features` in `imbed.components.vectorization`.
+from imbed.components.vectorization import (
+    three_text_features as simple_semantic_features,
+)
+from imbed.util import fuzzy_induced_graph as fuzzy_induced_graph, Node, Nodes
+# DFLT_EMBEDDING_FUNC = oa.embeddings
# Default text -> embedding function: the lightweight feature extractor
# imported above (alias of `three_text_features`).
DFLT_EMBEDDING_FUNC = simple_semantic_features
# Default seed for the randomized helpers below (used as the default `seed`
# of `get_n_unique_permutations`).
DFLT_RANDOM_SEED = 0
def get_n_unique_permutations(arr, n: int, seed: int = DFLT_RANDOM_SEED):
    """
    Get n unique permutations of an array, with a random seed fixed, and
    raise an error if n is larger than the number of possible permutations.

    Permutations are drawn over the *positions* of ``arr`` using a private
    ``random.Random(seed)`` generator. Consequently:

    - the elements of each returned tuple are the original objects of ``arr``
      (they are not coerced to numpy scalars or to a common dtype);
    - the global ``random`` and ``numpy.random`` states are left untouched;
    - the result, including its order, is fully determined by ``arr``, ``n``
      and ``seed`` (it does not depend on set iteration order, hence not on
      hash randomization);
    - the function always terminates, even when ``arr`` holds repeated
      values: two permutations count as different when they arrange the
      positions of ``arr`` differently.

    Args:
        arr (list): The list to permute.
        n (int): The number of unique permutations to generate.
        seed (int): The random seed for reproducibility.

    Returns:
        list: A list of ``n`` unique permutations (tuples), in the order they
        were drawn. The list is empty when ``n <= 0``.

    Raises:
        ValueError: If n is larger than the number of possible permutations.

    Examples:
        Which permutations are selected depends on the seed, so the examples
        check the contract rather than exact values.

        >>> perms = get_n_unique_permutations([1, 2, 3], 2)
        >>> len(perms), len(set(perms))
        (2, 2)
        >>> all(sorted(p) == [1, 2, 3] for p in perms)
        True
        >>> perms == get_n_unique_permutations([1, 2, 3], 2, seed=0)
        True
        >>> sorted(get_n_unique_permutations([1, 2, 3], 6))
        [(1, 2, 3), (1, 3, 2), (2, 1, 3), (2, 3, 1), (3, 1, 2), (3, 2, 1)]
        >>> get_n_unique_permutations([1, 2, 3], 7)
        Traceback (most recent call last):
        ...
        ValueError: n (=7) is larger than the number of possible permutations: 6
    """
    import itertools
    import math
    import random

    items = list(arr)
    n_perms = math.factorial(len(items))
    if n > n_perms:
        raise ValueError(
            f"n (={n}) is larger than the number of possible permutations: {n_perms}"
        )
    if n <= 0:
        return []

    rng = random.Random(seed)  # private generator: no global RNG side effects
    positions = range(len(items))

    if 2 * n >= n_perms:
        # Dense request: rejection sampling would mostly redraw duplicates.
        # There are at most 2 * n permutations here, so enumerate and sample.
        index_perms = rng.sample(list(itertools.permutations(positions)), n)
    else:
        # Sparse request: draw until n distinct position-orderings are found.
        # `seen` is only used for membership; the list keeps the draw order so
        # that the output order is reproducible.
        seen = set()
        index_perms = []
        while len(index_perms) < n:
            candidate = tuple(rng.sample(positions, len(items)))
            if candidate not in seen:
                seen.add(candidate)
                index_perms.append(candidate)

    # Map positions back to the original (uncoerced) elements.
    return [tuple(items[i] for i in index_perm) for index_perm in index_perms]
def aggregated_embeddings_for_sample(
    graph: Mapping[Node, Nodes],
    n_nodes: int,
    n_permutations: int,
    *,
    node_to_text: Callable[[Node], str],
    aggregate_texts: callable = "\n\n".join,
    text_to_embedding: callable = None,
    max_permutations: int = 100,
    seed: int = 0,
):
    """
    Get a (without replacement) sample of n_nodes nodes of the graph and, for
    each sampled node, take up to n_permutations permutations of its neighbor
    (cited) nodes, aggregate the text of the node followed by the texts of the
    permuted neighbors, and optionally compute the embedding of that text.

    This is a generator: nothing (including argument validation) happens
    until it is iterated.

    Args:
        graph (dict): The citation graph, mapping a node to its neighbors.
        n_nodes (int): The number of nodes to sample.
        n_permutations (int): The number of permutations to take for each
            list of neighbors.
        node_to_text (callable): A function that takes a node (ID) and returns
            the text to embed.
        aggregate_texts (callable): A function that takes a list of texts and
            returns a single aggregated text. The list starts with the text of
            the sampled node, followed by the neighbors' texts.
        text_to_embedding (callable): A function that takes a text and returns
            an embedding. If None, no "embedding" key is produced.
        max_permutations (int): The maximum number of permutations to take.
        seed (int): The random seed for reproducibility. It drives both the
            node sample and the choice of permutations.

    Returns:
        An iterable of dicts with keys "citing_id", "n_cited",
        "permutation_index", "aggregated_title" and, when text_to_embedding
        is given, "embedding". Nodes without neighbors yield nothing. The
        number of permutations per node is capped by n_permutations,
        max_permutations and the factorial of its number of neighbors.

    Raises:
        ValueError: If n_nodes is larger than the number of nodes in graph.

    Examples:
        The output below is illustrative (which nodes and orderings are
        selected depends on the seed), hence not run as a doctest.

        >>> graph = {
        ...     'paper1': ['paper2', 'paper3'],
        ...     'paper2': ['paper3'],
        ...     'paper3': ['paper1'],
        ...     'paper4': [],
        ... }
        >>> node_titles = {
        ...     'paper1': 'Title of Paper 1',
        ...     'paper2': 'Title of Paper 2',
        ...     'paper3': 'Title of Paper 3',
        ...     'paper4': 'Title of Paper 4',
        ... }
        >>> n_nodes = 2
        >>> n_permutations = 2
        >>> list(aggregated_embeddings_for_sample(
        ...     graph, n_nodes, n_permutations, node_to_text=node_titles.get,
        ...     text_to_embedding=simple_semantic_features, seed=42
        ... ))  # doctest: +SKIP
        [{'citing_id': 'paper2',
        'n_cited': 1,
        'permutation_index': 0,
        'aggregated_title': 'Title of Paper 2\n\nTitle of Paper 3',
        'embedding': (8, 26, 0)},
        {'citing_id': 'paper1',
        'n_cited': 2,
        'permutation_index': 0,
        'aggregated_title': 'Title of Paper 1\n\nTitle of Paper 2\n\nTitle of Paper 3',
        'embedding': (12, 39, 0)},
        {'citing_id': 'paper1',
        'n_cited': 2,
        'permutation_index': 1,
        'aggregated_title': 'Title of Paper 1\n\nTitle of Paper 3\n\nTitle of Paper 2',
        'embedding': (12, 39, 0)}]
    """
    import random
    from math import factorial

    nodes = list(graph.keys())
    if n_nodes > len(nodes):
        raise ValueError(
            f"n_nodes ({n_nodes}) is larger than the number of nodes in the citation_graph ({len(nodes)})"
        )

    # Sample with a generator seeded by `seed`. Seeding numpy's global state
    # (as was done before) has no effect on the stdlib `random.sample`, so the
    # node sample was not reproducible.
    rng = random.Random(seed)
    sampled_nodes = rng.sample(nodes, n_nodes)

    # Loop-invariant cap on the number of permutations requested per node.
    requested_perms = min(n_permutations, max_permutations)

    for citing_id in sampled_nodes:
        neighbor_nodes = graph[citing_id]
        if len(neighbor_nodes) == 0:
            continue  # skip nodes with no citations
        citing_title = node_to_text(citing_id)
        # Never ask for more orderings than exist for this neighbor list.
        n_perms = min(requested_perms, factorial(len(neighbor_nodes)))
        perms = get_n_unique_permutations(neighbor_nodes, n_perms, seed=seed)
        for idx, perm in enumerate(perms):
            aggregated_title = aggregate_texts(
                [citing_title] + [node_to_text(neighbor_node) for neighbor_node in perm]
            )
            d = {
                "citing_id": citing_id,
                "n_cited": len(neighbor_nodes),
                "permutation_index": idx,
                "aggregated_title": aggregated_title,
            }
            if text_to_embedding is not None:
                d["embedding"] = text_to_embedding(aggregated_title)
            yield d
# Eager, tabular variant of `aggregated_embeddings_for_sample`: the generator's
# dicts are collected into a list and turned into a DataFrame (one row per
# sampled node and permutation).
get_aggregated_embeddings_for_sample = Pipe(
    aggregated_embeddings_for_sample,
    list,
    pd.DataFrame,
)
+# -------------------------------------------------------------------------------------
+# Tests
+# `simple_semantic_features` was moved/renamed in `imbed`; the current equivalent
+# is `three_text_features` in `imbed.components.vectorization`.
+from imbed.components.vectorization import (
+    three_text_features as simple_semantic_features,
+)
def test_get_n_unique_permutations():
    """Check the contract of get_n_unique_permutations: count, uniqueness, content."""
    source = [1, 2, 3]
    wanted = 2
    result = get_n_unique_permutations(source, wanted, seed=0)
    # Contract-based checks only: which permutations get selected depends on
    # the random generator, so exact values are deliberately not asserted.
    assert len(result) == wanted, "Wrong number of permutations returned"
    assert len(set(result)) == wanted, "Permutations are not unique"
    expected_members = sorted(source)
    assert all(sorted(candidate) == expected_members for candidate in result), (
        "Each result must be a permutation of the input array"
    )
def test_get_n_unique_permutations_error():
    """Asking for more permutations than exist must raise a ValueError."""
    raised = None
    try:
        # A 3-element list only has 6 permutations, so 7 cannot be served.
        get_n_unique_permutations([1, 2, 3], 7)
    except ValueError as err:
        raised = err
    assert raised is not None, "ValueError was not raised when expected"
    assert (
        str(raised) == "n (=7) is larger than the number of possible permutations: 6"
    )
+from collections.abc import Sequence
+def _is_vector(v):
+    if not isinstance(v, Sequence):
+        return False
+    else:
+        first_element = next(iter(v), None)
+        return isinstance(first_element, (int, float))
def test_get_aggregated_embeddings_for_sample():
    """The DataFrame variant returns a bounded, non-empty table of vector embeddings."""
    paper_ids = [f"paper{i}" for i in range(1, 5)]
    links = {
        "paper1": ["paper2", "paper3"],
        "paper2": ["paper3"],
        "paper3": ["paper1"],
        "paper4": [],
    }
    titles = {f"paper{i}": f"Title of Paper {i}" for i in range(1, 5)}
    assert list(links) == paper_ids == list(titles)  # fixtures are aligned

    sample_size, perms_per_node = 2, 2
    table = get_aggregated_embeddings_for_sample(
        links,
        sample_size,
        perms_per_node,
        node_to_text=titles.get,
        text_to_embedding=simple_semantic_features,
    )

    assert not table.empty, "DataFrame is empty"
    row_limit = sample_size * perms_per_node
    assert len(table) <= row_limit, "DataFrame has more rows than expected"
    # Every embedding must be a sequence whose first element is a number
    # (see _is_vector).
    assert all(_is_vector(emb) for emb in table["embedding"]), (
        "Embeddings are not vectors"
    )
def test_node_with_no_citations():
    """A node without outgoing citations must not produce any row."""
    links = {
        "paper1": ["paper2", "paper3"],
        "paper2": ["paper3"],
        "paper3": ["paper1"],
        "paper4": [],
    }
    titles = {f"paper{i}": f"Title of Paper {i}" for i in range(1, 5)}

    # Sample every node so that the citation-less 'paper4' is always drawn.
    table = get_aggregated_embeddings_for_sample(
        links,
        4,
        2,
        node_to_text=titles.get,
        text_to_embedding=simple_semantic_features,
    )

    citing_ids = table["citing_id"].values
    assert "paper4" not in citing_ids, (
        "Node with no citations should be skipped"
    )
def test_n_nodes_too_large():
    """Requesting more nodes than the graph holds must raise a ValueError."""
    links = {
        "paper1": ["paper2", "paper3"],
        "paper2": ["paper3"],
        "paper3": ["paper1"],
        "paper4": [],
    }
    titles = {f"paper{i}": f"Title of Paper {i}" for i in range(1, 5)}

    raised = None
    try:
        # The graph has 4 nodes, so a sample of 5 is impossible.
        get_aggregated_embeddings_for_sample(
            links,
            5,
            2,
            node_to_text=titles.get,
            text_to_embedding=simple_semantic_features,
        )
    except ValueError as err:
        raised = err

    assert raised is not None, "ValueError was not raised when expected"
    assert (
        "n_nodes (5) is larger than the number of nodes in the citation_graph (4)"
        in str(raised)
    )
def test_all_permutation_tools():
    """Run every test of this module, in definition order."""
    checks = (
        test_get_n_unique_permutations,
        test_get_n_unique_permutations_error,
        test_get_aggregated_embeddings_for_sample,
        test_node_with_no_citations,
        test_n_nodes_too_large,
    )
    for check in checks:
        check()