spells-mtg 0.6.1__tar.gz → 0.7.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: spells-mtg
-Version: 0.6.1
+Version: 0.7.1
 Summary: analysis of 17Lands.com public datasets
 Author-Email: Joel Barnes <oelarnes@gmail.com>
 License: MIT
@@ -75,6 +75,7 @@ Spells is not affiliated with 17Lands. Please review the [Usage Guidelines](http
 - Caches aggregate DataFrames in the local file system automatically for instantaneous reproduction of previous analysis
 - Manages grouping and filtering by built-in and custom columns at the row level
 - Provides 124 explicitly specified, enumerated, documented column definitions
+- Can aggregate over multiple sets at once, even all of them, if you want.
 - Supports "Deck Color Data" aggregations with built-in column definitions.
 - Lets you feed card metrics back into column definitions to support scientific workflows like MLE
 - Provides a CLI tool `spells [add|refresh|clean|remove|info] [SET]` to download and manage external files
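The headline addition in this release is multi-set aggregation. A minimal sketch of the new call shape, assuming both datasets were downloaded with the CLI ('DSK' appears in the docs; 'FDN' is a hypothetical second set code):

```python
import spells

# New in 0.7.x: summon accepts a list of set codes and aggregates across them.
# Column and group names follow the built-in ColName values shown in this diff.
df = spells.summon(
    ["DSK", "FDN"],
    columns=["num_seen", "alsa", "num_taken"],
    group_by=["name"],
)
print(df.head())
```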
@@ -156,9 +157,9 @@ Spells is not affiliated with 17Lands. Please review the [Usage Guidelines](http
 ...     'deq_base': ColSpec(
 ...         col_type=ColType.AGG,
 ...         expr=(pl.col('gp_wr_excess') + 0.03 * (1 - pl.col('ata')/14).pow(2)) * pl.col('pct_gp'),
-...     }
+...     )
 ... }
->>> spells.summon('DSK', columns=['deq_base', 'color', 'rarity'], filter_spec={'player_cohort': 'Top'}, extensions=ext)
+>>> spells.summon('DSK', columns=['deq_base'], group_by=["name", "color", "rarity"], filter_spec={'player_cohort': 'Top'}, extensions=ext)
 ...     .filter(pl.col('deq_base').is_finite())
 ...     .filter(pl.col('rarity').is_in(['common', 'uncommon']))
 ...     .sort('deq_base', descending=True)
@@ -304,25 +305,26 @@ summon(
 
 #### parameters
 
-- columns: a list of string or `ColName` values to select as non-grouped columns. Valid `ColTypes` are `PICK_SUM`, `NAME_SUM`, `GAME_SUM`, `CARD_ATTR`, and `AGG`. Min/Max/Unique
+- `columns`: a list of string or `ColName` values to select as non-grouped columns. Valid `ColTypes` are `PICK_SUM`, `NAME_SUM`, `GAME_SUM`, and `AGG`. Min/Max/Unique
   aggregations of non-numeric (or numeric) data types are not supported. If `None`, use a set of columns modeled on the commonly used values on 17Lands.com/card_data.
 
-- group_by: a list of string or `ColName` values to display as grouped columns. Valid `ColTypes` are `GROUP_BY` and `CARD_ATTR`. By default, group by "name" (card name).
+- `group_by`: a list of string or `ColName` values to display as grouped columns. Valid `ColTypes` are `GROUP_BY` and `CARD_ATTR`. By default, group by "name" (card name). For contextual card attrs, include
+  them in `group_by`, even when grouping by name.
 
-- filter_spec: a dictionary specifying a filter, using a small number of paradigms. Columns used must be in each base view ("draft" and "game") that the `columns` and `group_by` columns depend on, so
-  `AGG` and `CARD_ATTR` columns are not valid. Functions of card attributes in the base views can be filtered on, see the documentation for `expr` for details. `NAME_SUM` columns are also not supported. Derived columns are supported. No filter is applied by default. Yes, I should rewrite it to use the mongo query language. The specification is best understood with examples:
+- `filter_spec`: a dictionary specifying a filter, using a small number of paradigms. Columns used must be in each base view ("draft" and "game") that the `columns` and `group_by` columns depend on, so
+  `AGG` and `CARD_ATTR` columns are not valid. Functions of card attributes in the base views can be filtered on using `card_context`; see the documentation for `expr` for details. `NAME_SUM` columns are also not supported. Derived columns are supported. No filter is applied by default. Yes, I should rewrite it to use the mongo query language. The specification is best understood with examples:
 
   - `{'player_cohort': 'Top'}` "player_cohort" value equals "Top".
   - `{'lhs': 'player_cohort', 'op': 'in', 'rhs': ['Top', 'Middle']}` "player_cohort" value is either "Top" or "Middle". Supported values for `op` are `<`, `<=`, `>`, `>=`, `!=`, `=`, `in` and `nin`.
   - `{'$and': [{'lhs': 'draft_date', 'op': '>', 'rhs': datetime.date(2024, 10, 7)}, {'rank': 'Mythic'}]}` Drafts after October 7 by Mythic-ranked players. Supported values for query construction keys are `$and`, `$or`, and `$not`.
 
-- extensions: a dict of `spells.columns.ColSpec` objects, keyed by name, which are appended to the definitions built-in columns described below.
+- `extensions`: a dict of `spells.columns.ColSpec` objects, keyed by name, which are appended to the definitions of the built-in columns described below.
 
-- card_context: Typically a Polars DataFrame containing a `"name"` column with one row for each card name in the set, such that any usages of `card_context[name][key]` in column specs reference the column `key`. Typically this will be the output of a call to `summon` requesting cards metrics like `GP_WR`. Can also be a dictionary having the necessary form for the same access pattern.
+- `card_context`: Typically a Polars DataFrame containing a `"name"` column with one row for each card name in the set, such that any usages of `card_context[name][key]` in column specs reference the column `key`. Typically this will be the output of a call to `summon` requesting card metrics like `GP_WR`. Can also be a dictionary having the necessary form for the same access pattern.
 
-- set_context: Typically, a dict of abitrary values to use in column definitions, for example, you could provide the quick draft release date and have a column that depended on that.
+- `set_context`: Typically, a dict of arbitrary values to use in column definitions; for example, you could provide the quick draft release date and have a column that depended on that. You can also provide a one-row dataframe and access the column values.
 
-- read_cache/write_cache: Use the local file system to cache and retrieve aggregations to minimize expensive reads of the large datasets. You shouldn't need to touch these arguments unless you are debugging.
+- `read_cache`/`write_cache`: Use the local file system to cache and retrieve aggregations to minimize expensive reads of the large datasets. You shouldn't need to touch these arguments unless you are debugging.
 
 ### Enums
 
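The filter paradigms above compose into nested queries. A short sketch, assuming the 'DSK' dataset is already downloaded:

```python
import datetime
import spells

# An equality shorthand combined with an explicit lhs/op/rhs comparison
# under a '$and' key, exactly as the paradigms documented above describe.
filter_spec = {
    "$and": [
        {"player_cohort": "Top"},
        {"lhs": "draft_date", "op": ">", "rhs": datetime.date(2024, 10, 7)},
    ]
}
df = spells.summon("DSK", filter_spec=filter_spec)
```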
@@ -356,6 +358,7 @@ summing over groups, and can include polars Expression aggregations. Arbitrarily
 - For `NAME_SUM` columns, `expr` must be a function of `name` which will result in a list of expressions mapped over all card names.
 - `PICK_SUM` columns can also be functions on `name`, in which case the value will be a function of the value of the `PICK` field.
 - `AGG` columns that depend on `NAME_SUM` columns reference the prefix (`cdef.name`) only, since the unpivot has occurred prior to selection.
+- `AGG` columns must not be functions, since they may be applied to the aggregation of several sets' data. (And they shouldn't need to be, anyway.)
 - The possible arguments to `expr`, in addition to `name` when appropriate, are as follows:
   - `names`: An array of all card names in the canonical order.
   - `card_context`: A dictionary keyed by card name which contains card dict objects with all `CARD_ATTR` values, including custom extensions and metric columns passed by the `card_context` argument to `summon`. See example notebooks for more details.
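A sketch contrasting the two `expr` forms under these rules. The `AGG` entry mirrors the `deq_base` example earlier; the `NAME_SUM` entry and its source-column pattern are hypothetical, and the import path for `ColType` is assumed:

```python
import polars as pl
from spells.columns import ColSpec, ColType  # ColType location assumed

ext = {
    # NAME_SUM: expr is a function of the card name, mapped over all names
    # (hypothetical column counting appearances of each card in packs).
    "seen_in_pack": ColSpec(
        col_type=ColType.NAME_SUM,
        expr=lambda name: pl.col(f"pack_card_{name}"),  # source pattern assumed
    ),
    # AGG: a pure polars expression, never a function, since it may be
    # evaluated over the concatenated aggregates of several sets.
    "gp_wr_excess_pct": ColSpec(
        col_type=ColType.AGG,
        expr=pl.col("gp_wr_excess") * 100,
    ),
}
```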
@@ -64,6 +64,7 @@ Spells is not affiliated with 17Lands. Please review the [Usage Guidelines](http
 - Caches aggregate DataFrames in the local file system automatically for instantaneous reproduction of previous analysis
 - Manages grouping and filtering by built-in and custom columns at the row level
 - Provides 124 explicitly specified, enumerated, documented column definitions
+- Can aggregate over multiple sets at once, even all of them, if you want.
 - Supports "Deck Color Data" aggregations with built-in column definitions.
 - Lets you feed card metrics back into column definitions to support scientific workflows like MLE
 - Provides a CLI tool `spells [add|refresh|clean|remove|info] [SET]` to download and manage external files
@@ -145,9 +146,9 @@ Spells is not affiliated with 17Lands. Please review the [Usage Guidelines](http
 ...     'deq_base': ColSpec(
 ...         col_type=ColType.AGG,
 ...         expr=(pl.col('gp_wr_excess') + 0.03 * (1 - pl.col('ata')/14).pow(2)) * pl.col('pct_gp'),
-...     }
+...     )
 ... }
->>> spells.summon('DSK', columns=['deq_base', 'color', 'rarity'], filter_spec={'player_cohort': 'Top'}, extensions=ext)
+>>> spells.summon('DSK', columns=['deq_base'], group_by=["name", "color", "rarity"], filter_spec={'player_cohort': 'Top'}, extensions=ext)
 ...     .filter(pl.col('deq_base').is_finite())
 ...     .filter(pl.col('rarity').is_in(['common', 'uncommon']))
 ...     .sort('deq_base', descending=True)
@@ -293,25 +294,26 @@ summon(
 
 #### parameters
 
-- columns: a list of string or `ColName` values to select as non-grouped columns. Valid `ColTypes` are `PICK_SUM`, `NAME_SUM`, `GAME_SUM`, `CARD_ATTR`, and `AGG`. Min/Max/Unique
+- `columns`: a list of string or `ColName` values to select as non-grouped columns. Valid `ColTypes` are `PICK_SUM`, `NAME_SUM`, `GAME_SUM`, and `AGG`. Min/Max/Unique
   aggregations of non-numeric (or numeric) data types are not supported. If `None`, use a set of columns modeled on the commonly used values on 17Lands.com/card_data.
 
-- group_by: a list of string or `ColName` values to display as grouped columns. Valid `ColTypes` are `GROUP_BY` and `CARD_ATTR`. By default, group by "name" (card name).
+- `group_by`: a list of string or `ColName` values to display as grouped columns. Valid `ColTypes` are `GROUP_BY` and `CARD_ATTR`. By default, group by "name" (card name). For contextual card attrs, include
+  them in `group_by`, even when grouping by name.
 
-- filter_spec: a dictionary specifying a filter, using a small number of paradigms. Columns used must be in each base view ("draft" and "game") that the `columns` and `group_by` columns depend on, so
-  `AGG` and `CARD_ATTR` columns are not valid. Functions of card attributes in the base views can be filtered on, see the documentation for `expr` for details. `NAME_SUM` columns are also not supported. Derived columns are supported. No filter is applied by default. Yes, I should rewrite it to use the mongo query language. The specification is best understood with examples:
+- `filter_spec`: a dictionary specifying a filter, using a small number of paradigms. Columns used must be in each base view ("draft" and "game") that the `columns` and `group_by` columns depend on, so
+  `AGG` and `CARD_ATTR` columns are not valid. Functions of card attributes in the base views can be filtered on using `card_context`; see the documentation for `expr` for details. `NAME_SUM` columns are also not supported. Derived columns are supported. No filter is applied by default. Yes, I should rewrite it to use the mongo query language. The specification is best understood with examples:
 
   - `{'player_cohort': 'Top'}` "player_cohort" value equals "Top".
   - `{'lhs': 'player_cohort', 'op': 'in', 'rhs': ['Top', 'Middle']}` "player_cohort" value is either "Top" or "Middle". Supported values for `op` are `<`, `<=`, `>`, `>=`, `!=`, `=`, `in` and `nin`.
   - `{'$and': [{'lhs': 'draft_date', 'op': '>', 'rhs': datetime.date(2024, 10, 7)}, {'rank': 'Mythic'}]}` Drafts after October 7 by Mythic-ranked players. Supported values for query construction keys are `$and`, `$or`, and `$not`.
 
-- extensions: a dict of `spells.columns.ColSpec` objects, keyed by name, which are appended to the definitions built-in columns described below.
+- `extensions`: a dict of `spells.columns.ColSpec` objects, keyed by name, which are appended to the definitions of the built-in columns described below.
 
-- card_context: Typically a Polars DataFrame containing a `"name"` column with one row for each card name in the set, such that any usages of `card_context[name][key]` in column specs reference the column `key`. Typically this will be the output of a call to `summon` requesting cards metrics like `GP_WR`. Can also be a dictionary having the necessary form for the same access pattern.
+- `card_context`: Typically a Polars DataFrame containing a `"name"` column with one row for each card name in the set, such that any usages of `card_context[name][key]` in column specs reference the column `key`. Typically this will be the output of a call to `summon` requesting card metrics like `GP_WR`. Can also be a dictionary having the necessary form for the same access pattern.
 
-- set_context: Typically, a dict of abitrary values to use in column definitions, for example, you could provide the quick draft release date and have a column that depended on that.
+- `set_context`: Typically, a dict of arbitrary values to use in column definitions; for example, you could provide the quick draft release date and have a column that depended on that. You can also provide a one-row dataframe and access the column values.
 
-- read_cache/write_cache: Use the local file system to cache and retrieve aggregations to minimize expensive reads of the large datasets. You shouldn't need to touch these arguments unless you are debugging.
+- `read_cache`/`write_cache`: Use the local file system to cache and retrieve aggregations to minimize expensive reads of the large datasets. You shouldn't need to touch these arguments unless you are debugging.
 
 ### Enums
 
@@ -345,6 +347,7 @@ summing over groups, and can include polars Expression aggregations. Arbitrarily
 - For `NAME_SUM` columns, `expr` must be a function of `name` which will result in a list of expressions mapped over all card names.
 - `PICK_SUM` columns can also be functions on `name`, in which case the value will be a function of the value of the `PICK` field.
 - `AGG` columns that depend on `NAME_SUM` columns reference the prefix (`cdef.name`) only, since the unpivot has occurred prior to selection.
+- `AGG` columns must not be functions, since they may be applied to the aggregation of several sets' data. (And they shouldn't need to be, anyway.)
 - The possible arguments to `expr`, in addition to `name` when appropriate, are as follows:
   - `names`: An array of all card names in the canonical order.
   - `card_context`: A dictionary keyed by card name which contains card dict objects with all `CARD_ATTR` values, including custom extensions and metric columns passed by the `card_context` argument to `summon`. See example notebooks for more details.
@@ -11,7 +11,7 @@ dependencies = [
 ]
 requires-python = ">=3.11"
 readme = "README.md"
-version = "0.6.1"
+version = "0.7.1"
 
 [project.license]
 text = "MIT"
@@ -25,8 +25,6 @@ class ColDef:
 
 
 default_columns = [
-    ColName.COLOR,
-    ColName.RARITY,
     ColName.NUM_SEEN,
     ColName.ALSA,
     ColName.NUM_TAKEN,
@@ -139,6 +139,7 @@ def _determine_expression(
 
     elif spec.expr is not None:
         if isinstance(spec.expr, Callable):
+            assert not spec.col_type == ColType.AGG, f"AGG column {col} must be a pure spells expression"
             params = seed_params(spec.expr)
             if (
                 spec.col_type == ColType.PICK_SUM
@@ -306,7 +307,7 @@ def _view_select(
     is_agg_view: bool,
 ) -> DF:
     base_cols = frozenset()
-    cdefs = [col_def_map[c] for c in view_cols]
+    cdefs = [col_def_map[c] for c in sorted(view_cols)]
     select = []
     for cdef in cdefs:
         if is_agg_view:
@@ -390,8 +391,10 @@ def _base_agg_df(
     )
 
     sum_col_df = base_df.select(nonname_gb + name_col_tuple + sum_cols)
+
+    grouped = sum_col_df.group_by(group_by) if group_by else sum_col_df
     join_dfs.append(
-        sum_col_df.group_by(group_by).sum().collect(streaming=use_streaming)
+        grouped.sum().collect(streaming=use_streaming)
     )
 
     name_sum_cols = tuple(
@@ -420,25 +423,26 @@
     )
 
     if not is_name_gb:
-        df = (
-            unpivoted.drop("name")
-            .group_by(nonname_gb)
-            .sum()
-            .collect(streaming=use_streaming)
-        )
+        grouped = unpivoted.drop("name").group_by(nonname_gb) if nonname_gb else unpivoted.drop("name")
+        df = grouped.sum().collect(streaming=use_streaming)
     else:
         df = unpivoted.collect(streaming=use_streaming)
 
     join_dfs.append(df)
 
-    return functools.reduce(
-        lambda prev, curr: prev.join(curr, on=group_by, how="outer", coalesce=True),
-        join_dfs,
-    )
+    if group_by:
+        joined_df = functools.reduce(
+            lambda prev, curr: prev.join(curr, on=group_by, how="outer", coalesce=True),
+            join_dfs,
+        )
+    else:
+        joined_df = pl.concat(join_dfs, how='horizontal')
+
+    return joined_df
 
 
 def summon(
-    set_code: str,
+    set_code: str | list[str],
     columns: list[str] | None = None,
     group_by: list[str] | None = None,
     filter_spec: dict | None = None,
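These branches exist because Polars has no empty grouping: no keys means "aggregate the whole frame", and keyless one-row results are recombined by horizontal concat rather than a keyed join. A self-contained sketch of the distinction:

```python
import polars as pl

lf = pl.LazyFrame({"name": ["a", "a", "b"], "wins": [1, 0, 1], "games": [1, 1, 1]})

# Keyed: one row per group, later joinable with other frames on the keys.
keyed = lf.group_by(["name"]).sum().collect()

# Keyless: drop non-numeric columns and sum the whole frame to a single row;
# such one-row frames can only be glued back together horizontally.
totals = lf.drop("name").sum().collect()

print(keyed, totals, sep="\n")
```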
@@ -446,7 +450,7 @@ def summon(
     use_streaming: bool = False,
     read_cache: bool = True,
     write_cache: bool = True,
-    card_context: pl.DataFrame | dict[str, dict] | None = None,
+    card_context: pl.DataFrame | dict[str, Any] | None = None,
     set_context: pl.DataFrame | dict[str, Any] | None = None,
 ) -> pl.DataFrame:
     specs = get_specs()
@@ -457,38 +461,72 @@
     for ext in extensions:
         specs.update(ext)
 
-    col_def_map = _hydrate_col_defs(set_code, specs, card_context, set_context)
-    m = spells.manifest.create(col_def_map, columns, group_by, filter_spec)
-
-    calc_fn = functools.partial(_base_agg_df, set_code, m, use_streaming=use_streaming)
-    agg_df = _fetch_or_cache(
-        calc_fn,
-        set_code,
-        (
-            set_code,
-            sorted(m.view_cols.get(View.DRAFT, set())),
-            sorted(m.view_cols.get(View.GAME, set())),
-            sorted(c.signature or "" for c in m.col_def_map.values()),
-            sorted(m.base_view_group_by),
-            filter_spec,
-        ),
-        read_cache=read_cache,
-        write_cache=write_cache,
-    )
+    if isinstance(set_code, str):
+        card_context = {set_code: card_context}
+        set_context = {set_code: set_context}
+        codes = [set_code]
+    else:
+        codes = set_code
+
+    assert codes, "Please ask for at least one set"
+
+    m = None
 
-    if View.CARD in m.view_cols:
-        card_cols = m.view_cols[View.CARD].union({ColName.NAME})
-        fp = data_file_path(set_code, View.CARD)
-        card_df = pl.read_parquet(fp)
-        select_df = _view_select(card_df, card_cols, m.col_def_map, is_agg_view=False)
-        agg_df = agg_df.join(select_df, on="name", how="outer", coalesce=True)
+    concat_dfs = []
+    for code in codes:
+        if isinstance(card_context, pl.DataFrame):
+            set_card_context = card_context.filter(pl.col('expansion') == code)
+        elif isinstance(card_context, dict):
+            set_card_context = card_context[code]
+        else:
+            set_card_context = None
+
+        if isinstance(set_context, pl.DataFrame):
+            this_set_context = set_context.filter(pl.col('expansion') == code)
+        elif isinstance(set_context, dict):
+            this_set_context = set_context[code]
+        else:
+            this_set_context = None
+
+        col_def_map = _hydrate_col_defs(code, specs, set_card_context, this_set_context)
+        m = spells.manifest.create(col_def_map, columns, group_by, filter_spec)
+
+        calc_fn = functools.partial(_base_agg_df, code, m, use_streaming=use_streaming)
+        agg_df = _fetch_or_cache(
+            calc_fn,
+            code,
+            (
+                code,
+                sorted(m.view_cols.get(View.DRAFT, set())),
+                sorted(m.view_cols.get(View.GAME, set())),
+                sorted(c.signature or "" for c in m.col_def_map.values()),
+                sorted(m.base_view_group_by),
+                filter_spec,
+            ),
+            read_cache=read_cache,
+            write_cache=write_cache,
+        )
+
+        if View.CARD in m.view_cols:
+            card_cols = m.view_cols[View.CARD].union({ColName.NAME})
+            fp = data_file_path(code, View.CARD)
+            card_df = pl.read_parquet(fp)
+            select_df = _view_select(card_df, card_cols, m.col_def_map, is_agg_view=False)
+            agg_df = agg_df.join(select_df, on="name", how="outer", coalesce=True)
+        concat_dfs.append(agg_df)
 
-    if ColName.NAME not in m.group_by:
-        agg_df = agg_df.group_by(m.group_by).sum()
+    full_agg_df = pl.concat(concat_dfs, how='vertical')
+
+    assert m is not None, "What happened? We mean to use one of the sets manifest, it shouldn't matter which."
+
+    if m.group_by:
+        full_agg_df = full_agg_df.group_by(m.group_by).sum()
+    else:
+        full_agg_df = full_agg_df.sum()
 
     ret_cols = m.group_by + m.columns
     ret_df = (
-        _view_select(agg_df, frozenset(ret_cols), m.col_def_map, is_agg_view=True)
+        _view_select(full_agg_df, frozenset(ret_cols), m.col_def_map, is_agg_view=True)
         .select(ret_cols)
         .sort(m.group_by)
     )
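Given this per-set loop, `card_context` for a multi-set call is indexed by set code when it is a dict, or filtered on an `expansion` column when it is a DataFrame. A sketch of the dict form (set codes illustrative; 'FDN' is hypothetical):

```python
import spells

# Per-set card metrics computed up front, one summon call per set.
dsk_metrics = spells.summon("DSK", columns=["gp_wr"])
fdn_metrics = spells.summon("FDN", columns=["gp_wr"])  # hypothetical set code

# The dict form maps each set code to its own context; a single DataFrame
# would instead need an 'expansion' column, which summon filters with
# pl.col('expansion') == code for each set in the list.
df = spells.summon(
    ["DSK", "FDN"],
    card_context={"DSK": dsk_metrics, "FDN": fdn_metrics},
)
```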
@@ -15,6 +15,7 @@ from enum import StrEnum
 
 import wget
 import polars as pl
+from polars.exceptions import ComputeError
 
 from spells import cards
 from spells import cache
@@ -231,7 +232,15 @@ def _process_zipped_file(gzip_path, target_path):
 
     os.remove(gzip_path)
     df = pl.scan_csv(csv_path, schema=schema(csv_path))
-    df.sink_parquet(target_path)
+    try:
+        df.sink_parquet(target_path)
+    except ComputeError:
+        df = pl.scan_csv(csv_path)
+        cache.spells_print('error', 'Bad schema found, loading dataset into memory'\
+            + ' and attempting to cast to correct schema')
+        select = [pl.col(name).cast(dtype) for name, dtype in schema(csv_path).items()]
+        cast_df = df.select(select).collect()
+        cast_df.write_parquet(target_path)
 
     os.remove(csv_path)
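The fallback pattern in isolation: a strict schema makes the streaming `sink_parquet` fail when a value cannot be parsed as the declared dtype, so the file is re-scanned with inferred types, cast explicitly, and written from memory. A toy reproduction (file name and contents invented):

```python
import polars as pl
from polars.exceptions import ComputeError

# A float where the declared schema expects an integer trips strict parsing.
with open("toy.csv", "w") as f:
    f.write("rank,wins\nMythic,3\nGold,2.5\n")

declared = {"rank": pl.Utf8, "wins": pl.Int8}
lf = pl.scan_csv("toy.csv", schema=declared)
try:
    lf.sink_parquet("toy.parquet")
except ComputeError:
    # Re-scan with inferred dtypes, then cast column-by-column, as the
    # patched _process_zipped_file does.
    lf = pl.scan_csv("toy.csv")
    cast = [pl.col(name).cast(dtype) for name, dtype in declared.items()]
    lf.select(cast).collect().write_parquet("toy.parquet")
```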
@@ -162,13 +162,10 @@ def create(
     group_by: list[str] | None = None,
     filter_spec: dict | None = None,
 ):
-    gbs = (ColName.NAME,) if group_by is None else tuple(group_by)
+    gbs = (ColName.NAME, ColName.COLOR, ColName.RARITY) if group_by is None else tuple(group_by)
+
     if columns is None:
         cols = tuple(spells.columns.default_columns)
-        if ColName.NAME not in gbs:
-            cols = tuple(
-                c for c in cols if col_def_map[c].col_type != ColType.CARD_ATTR
-            )
     else:
         cols = tuple(columns)
 
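Combined with the `default_columns` change above (COLOR and RARITY removed), card attributes now surface through the default grouping rather than the default column list. The expected default call, sketched:

```python
import spells

# With no group_by argument, 0.7.x groups by name, color, and rarity,
# so color and rarity arrive as group columns rather than as CARD_ATTR
# value columns requested via `columns`.
df = spells.summon("DSK")
print(df.columns)  # expected to begin with ['name', 'color', 'rarity', ...]
```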
@@ -136,16 +136,14 @@ COLUMN_TYPES = (
     (re.compile(r"^oppo_total_cards_drawn_or_tutored$"), pl.Int8),
 )
 
-
 def schema(
     filename: str, print_missing: bool = False
-) -> Dict[str, pl.datatypes.DataType] | None:
+) -> Dict[str, pl.datatypes.DataType]:
     dtypes: Dict[str, pl.datatypes.DataType] = {}
     with open(filename, encoding="utf-8") as f:
         columns = csv.DictReader(f).fieldnames
     if columns is None:
-        print(f"Could not read fieldnames from {filename}")
-        return None
+        raise ValueError(f"Could not read fieldnames from {filename}")
    for column in columns:
         for regex, column_type in COLUMN_TYPES:
             if regex.match(column):
6 files without changes