PyPI - fairspec-library - Versions diffs - 0.1.1__tar.gz - Mend

fairspec-library 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53) hide show

fairspec_library-0.1.1/.gitignore ADDED Viewed

@@ -0,0 +1,229 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#   Usually these files are written by a python script from a template
+#   before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+# Pipfile.lock
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+# uv.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+# poetry.lock
+# poetry.toml
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+# pdm.lock
+# pdm.toml
+.pdm-python
+.pdm-build/
+# pixi
+#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+# pixi.lock
+#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+#   in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# Redis
+*.rdb
+*.aof
+*.pid
+# RabbitMQ
+mnesia/
+rabbitmq/
+rabbitmq-data/
+# ActiveMQ
+activemq-data/
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#   JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#   be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#   and can be added to the global gitignore or merged into this file.  For a more nuclear
+#   option (not recommended) you can uncomment the following to ignore the entire idea folder.
+# .idea/
+# Abstra
+#   Abstra is an AI-powered process automation framework.
+#   Ignore directories containing user credentials, local state, and settings.
+#   Learn more at https://abstra.io/docs
+.abstra/
+# Visual Studio Code
+#   Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+#   that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+#   and can be added to the global gitignore or merged into this file. However, if you prefer,
+#   you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+# Ruff stuff:
+.ruff_cache/
+# PyPI configuration file
+.pypirc
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
+# Streamlit
+.streamlit/secrets.toml
+# Node
+node_modules/
+jspm_packages/
+.lock-wscript
+build/Release
+.node_repl_history
+*.tgz
+.npm
+*.so
+# User
+/.claude/settings.local.json

fairspec_library-0.1.1/PKG-INFO ADDED Viewed

@@ -0,0 +1,22 @@
+Metadata-Version: 2.4
+Name: fairspec-library
+Version: 0.1.1
+Summary: Fairspec Python is a fast data management framework built on top of the Fairspec standard and Polars DataFrames
+Project-URL: homepage, https://github.com/fairspec/fairspec-python
+Project-URL: repository, https://github.com/fairspec/fairspec-python
+Author: Evgeny Karev
+License-Expression: MIT
+Keywords: data,dataframe,fair,fairspec,jsonschema,library,metadata,polars,python,quality,tableschema,validation
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.12
+Requires-Dist: fairspec-dataset
+Requires-Dist: fairspec-metadata
+Requires-Dist: fairspec-table
+Requires-Dist: genson>=1.3
+Description-Content-Type: text/markdown
+# fairspec-library
+Fairspec Python is a fast data management framework built on top of the Fairspec standard and Polars DataFrames. It supports various formats like CSV, JSON, and Parquet and integrates with data platforms such as CKAN, Zenodo, and GitHub. For more information, please read the [project's documentation](https://python.fairspec.org).

fairspec_library-0.1.1/README.md ADDED Viewed

@@ -0,0 +1,3 @@
+# fairspec-library
+Fairspec Python is a fast data management framework built on top of the Fairspec standard and Polars DataFrames. It supports various formats like CSV, JSON, and Parquet and integrates with data platforms such as CKAN, Zenodo, and GitHub. For more information, please read the [project's documentation](https://python.fairspec.org).

fairspec_library-0.1.1/fairspec_library/__init__.py ADDED Viewed

@@ -0,0 +1,29 @@
+from fairspec_dataset import *  # noqa: F403
+from fairspec_metadata import *  # noqa: F403
+from fairspec_table import *  # noqa: F403
+from .actions.data.load import load_data as load_data
+from .actions.data.validate import validate_data as validate_data
+from .actions.data_schema.infer import infer_data_schema as infer_data_schema
+from .actions.data_schema.render import render_data_schema_as as render_data_schema_as
+from .actions.dataset.foreign_key import (
+    validate_dataset_foreign_keys as validate_dataset_foreign_keys,
+)
+from .actions.dataset.infer import infer_dataset as infer_dataset
+from .actions.dataset.load import load_dataset as load_dataset
+from .actions.dataset.render import render_dataset_as as render_dataset_as
+from .actions.dataset.save import save_dataset as save_dataset
+from .actions.dataset.validate import validate_dataset as validate_dataset
+from .actions.file_dialect.infer import infer_file_dialect as infer_file_dialect
+from .actions.resource.infer import infer_resource as infer_resource
+from .actions.resource.validate import validate_resource as validate_resource
+from .actions.table.infer import infer_table as infer_table
+from .actions.table.load import load_table as load_table
+from .actions.table.save import save_table as save_table
+from .actions.table.validate import validate_table as validate_table
+from .actions.table_schema.infer import infer_table_schema as infer_table_schema
+from .actions.table_schema.render import render_table_schema_as as render_table_schema_as
+from .models.table import ValidateTableOptions as ValidateTableOptions
+from .plugin import Plugin as Plugin
+from .system import System as System
+from .system import system as system

fairspec_library-0.1.1/fairspec_library/actions/__init__.py ADDED Viewed

File without changes

fairspec_library-0.1.1/fairspec_library/actions/data/__init__.py ADDED Viewed

File without changes

fairspec_library-0.1.1/fairspec_library/actions/data/load.py ADDED Viewed

@@ -0,0 +1,27 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+from fairspec_metadata import (
+    get_data_first_path,
+    get_data_value,
+    get_supported_file_dialect,
+    load_descriptor,
+)
+if TYPE_CHECKING:
+    from fairspec_metadata import Resource
+def load_data(resource: Resource) -> object | None:
+    data_value = get_data_value(resource)
+    if data_value:
+        return data_value
+    first_path = get_data_first_path(resource)
+    if first_path:
+        dialect = get_supported_file_dialect(resource, ["json"])
+        if dialect:
+            return load_descriptor(first_path)
+    return None

fairspec_library-0.1.1/fairspec_library/actions/data/load_spec.py ADDED Viewed

@@ -0,0 +1,38 @@
+from __future__ import annotations
+import json
+from fairspec_dataset import write_temp_file
+from fairspec_metadata import Resource
+from .load import load_data
+class TestLoadData:
+    def test_should_return_inline_data(self):
+        resource = Resource(data=[{"id": 1}, {"id": 2}])
+        result = load_data(resource)
+        assert result == [{"id": 1}, {"id": 2}]
+    def test_should_return_inline_object(self):
+        resource = Resource(data={"key": "value"})
+        result = load_data(resource)
+        assert result == {"key": "value"}
+    def test_should_load_json_file(self):
+        data = {"key": "value"}
+        path = write_temp_file(json.dumps(data), format="json")
+        resource = Resource(data=path)
+        result = load_data(resource)
+        assert result == data
+    def test_should_return_none_for_non_json_file(self):
+        path = write_temp_file("id,name\n1,english", format="csv")
+        resource = Resource(data=path)
+        result = load_data(resource)
+        assert result is None
+    def test_should_return_none_for_empty_resource(self):
+        resource = Resource()
+        result = load_data(resource)
+        assert result is None

fairspec_library-0.1.1/fairspec_library/actions/data/validate.py ADDED Viewed

@@ -0,0 +1,41 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+from fairspec_metadata import (
+    DataError,
+    FairspecError,
+    Report,
+    create_report,
+    inspect_json,
+    resolve_data_schema,
+)
+from .load import load_data
+if TYPE_CHECKING:
+    from fairspec_metadata import Resource
+def validate_data(resource: Resource) -> Report:
+    errors: list[FairspecError] = []
+    data_schema = resolve_data_schema(resource.dataSchema)
+    if not data_schema:
+        return create_report()
+    data = load_data(resource)
+    if data is None:
+        return create_report()
+    notes = inspect_json(data, json_schema=data_schema)
+    for note in notes:
+        errors.append(
+            DataError(
+                type="data",
+                message=note["message"],
+                jsonPointer=note["jsonPointer"],
+            )
+        )
+    return create_report(errors)

fairspec_library-0.1.1/fairspec_library/actions/data/validate_spec.py ADDED Viewed

@@ -0,0 +1,46 @@
+from __future__ import annotations
+from fairspec_metadata import Resource
+from .validate import validate_data
+class TestValidateData:
+    def test_should_return_valid_when_no_schema(self):
+        resource = Resource(data=[{"id": 1}])
+        report = validate_data(resource)
+        assert report.valid is True
+    def test_should_validate_inline_data(self):
+        resource = Resource(
+            data={"name": "test", "age": 25},
+            dataSchema={
+                "type": "object",
+                "properties": {
+                    "name": {"type": "string"},
+                    "age": {"type": "integer"},
+                },
+                "required": ["name", "age"],
+            },
+        )
+        report = validate_data(resource)
+        assert report.valid is True
+    def test_should_detect_invalid_data(self):
+        resource = Resource(
+            data={"name": 123},
+            dataSchema={
+                "type": "object",
+                "properties": {"name": {"type": "string"}},
+            },
+        )
+        report = validate_data(resource)
+        assert report.valid is False
+        assert len(report.errors) > 0
+    def test_should_return_valid_for_no_data(self):
+        resource = Resource(
+            dataSchema={"type": "object"},
+        )
+        report = validate_data(resource)
+        assert report.valid is True

fairspec_library-0.1.1/fairspec_library/actions/data_schema/__init__.py ADDED Viewed

File without changes

fairspec_library-0.1.1/fairspec_library/actions/data_schema/infer.py ADDED Viewed

@@ -0,0 +1,23 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+from genson import SchemaBuilder
+from fairspec_library.actions.data.load import load_data
+if TYPE_CHECKING:
+    from fairspec_metadata import JsonSchema, Resource
+def infer_data_schema(resource: Resource) -> JsonSchema | None:
+    data = load_data(resource)
+    if not data:
+        return None
+    try:
+        builder = SchemaBuilder()
+        builder.add_object(data)
+        return builder.to_schema()
+    except Exception:
+        return None

fairspec_library-0.1.1/fairspec_library/actions/data_schema/infer_spec.py ADDED Viewed

@@ -0,0 +1,40 @@
+from __future__ import annotations
+import json
+from fairspec_dataset import write_temp_file
+from fairspec_metadata import Resource
+from .infer import infer_data_schema
+class TestInferDataSchema:
+    def test_should_infer_schema_from_inline_data(self):
+        resource = Resource(data={"name": "test", "age": 25})
+        schema = infer_data_schema(resource)
+        assert schema is not None
+        assert schema.get("type") == "object"
+    def test_should_infer_schema_from_inline_array(self):
+        resource = Resource(data=[{"id": 1}, {"id": 2}])
+        schema = infer_data_schema(resource)
+        assert schema is not None
+    def test_should_infer_schema_from_json_file(self):
+        data = {"name": "test", "value": 42}
+        path = write_temp_file(json.dumps(data), format="json")
+        resource = Resource(data=path)
+        schema = infer_data_schema(resource)
+        assert schema is not None
+        assert schema.get("type") == "object"
+    def test_should_return_none_for_no_data(self):
+        resource = Resource()
+        schema = infer_data_schema(resource)
+        assert schema is None
+    def test_should_return_none_for_csv_file(self):
+        path = write_temp_file("id,name\n1,english", format="csv")
+        resource = Resource(data=path)
+        schema = infer_data_schema(resource)
+        assert schema is None

fairspec_library-0.1.1/fairspec_library/actions/data_schema/render.py ADDED Viewed

@@ -0,0 +1,19 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+from fairspec_library.system import system
+if TYPE_CHECKING:
+    from fairspec_metadata import DataSchema, RenderDataSchemaOptions
+def render_data_schema_as(
+    data_schema: DataSchema, options: RenderDataSchemaOptions
+) -> str | None:
+    for plugin in system.plugins:
+        result = plugin.render_data_schema_as(data_schema, options)
+        if result is not None:
+            return result
+    return None

fairspec_library-0.1.1/fairspec_library/actions/dataset/__init__.py ADDED Viewed

File without changes

fairspec_library-0.1.1/fairspec_library/actions/dataset/foreign_key.py ADDED Viewed

@@ -0,0 +1,98 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING, Unpack
+import polars as pl
+from fairspec_metadata import (
+    FairspecError,
+    ForeignKey,
+    ForeignKeyError,
+    Report,
+    create_report,
+    resolve_table_schema,
+)
+from fairspec_library.actions.table.load import load_table
+from fairspec_library.models.table import ValidateTableOptions
+if TYPE_CHECKING:
+    from fairspec_metadata import Dataset, Resource
+def validate_dataset_foreign_keys(
+    dataset: Dataset, **options: Unpack[ValidateTableOptions]
+) -> Report:
+    errors: list[FairspecError] = []
+    for resource in dataset.resources or []:
+        table_schema = resolve_table_schema(resource.tableSchema)
+        if not table_schema:
+            continue
+        for foreign_key in table_schema.foreignKeys or []:
+            fk_errors = _validate_foreign_key(
+                resource, foreign_key, dataset, **options
+            )
+            errors.extend(fk_errors)
+    return create_report(errors)
+def _validate_foreign_key(
+    resource: Resource,
+    foreign_key: ForeignKey,
+    dataset: Dataset,
+    **options: Unpack[ValidateTableOptions],
+) -> list[ForeignKeyError]:
+    reference = foreign_key.reference
+    columns = foreign_key.columns
+    ref_columns = reference.columns
+    if not columns or not ref_columns or len(columns) != len(ref_columns):
+        return []
+    ref_resource = _find_resource(dataset, reference.resource)
+    if not ref_resource:
+        return []
+    table = load_table(resource, denormalized=True, **options)
+    if table is None:
+        return []
+    ref_table = load_table(ref_resource, denormalized=True, **options)
+    if ref_table is None:
+        return []
+    rename_mapping = dict(zip(ref_columns, columns))
+    ref_selected = ref_table.select(
+        [pl.col(name).alias(rename_mapping[name]) for name in ref_columns]
+    ).unique()
+    violations: pl.DataFrame = table.select(columns).join(  # ty: ignore[invalid-assignment] https://github.com/astral-sh/ty/issues/2278
+        ref_selected, on=columns, how="anti"
+    ).unique().collect()
+    errors: list[ForeignKeyError] = []
+    for row in violations.to_dicts():
+        cells = [str(row[c]) for c in columns]
+        errors.append(
+            ForeignKeyError(
+                type="foreignKey",
+                resourceName=resource.name,
+                foreignKey=foreign_key,
+                cells=cells,
+            )
+        )
+    return errors
+def _find_resource(dataset: Dataset, name: str | None) -> Resource | None:
+    if not name:
+        return None
+    for resource in dataset.resources or []:
+        if resource.name == name:
+            return resource
+    return None