PyPI - welearn-database - Versions diffs - 0.1.0__py3-none-any.whl - Mend

welearn-database 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

welearn_database/__init__.py +0 -0
welearn_database/alembic/README +1 -0
welearn_database/alembic/env.py +94 -0
welearn_database/alembic/script.py.mako +26 -0
welearn_database/alembic/versions/16ff997426d3_remove_error_retrieval_unique_constraint.py +38 -0
welearn_database/alembic/versions/4c7161819e5a_grafana_views.py +189 -0
welearn_database/alembic/versions/4fcbfb7f3145_added_api_key_management_table.py +51 -0
welearn_database/alembic/versions/5d82613c9aca_context_document.py +67 -0
welearn_database/alembic/versions/821173cf9c5d_initial_migration.py +441 -0
welearn_database/alembic/versions/89920abb7ff8_add_category.py +54 -0
welearn_database/alembic/versions/a50a1db3ca2a_add_used_since_column_for_embeddings.py +34 -0
welearn_database/alembic/versions/b031206324b7_agent_related.py +80 -0
welearn_database/alembic/versions/e354666f951d_inferred_user.py +88 -0
welearn_database/data/__init__.py +0 -0
welearn_database/data/enumeration.py +22 -0
welearn_database/data/models/__init__.py +18 -0
welearn_database/data/models/agent_related.py +0 -0
welearn_database/data/models/corpus_related.py +98 -0
welearn_database/data/models/document_related.py +454 -0
welearn_database/data/models/grafana.py +0 -0
welearn_database/data/models/user_related.py +216 -0
welearn_database/database_utils.py +34 -0
welearn_database/exceptions.py +7 -0
welearn_database/modules/__init__.py +0 -0
welearn_database/modules/text_cleaning.py +89 -0
welearn_database-0.1.0.dist-info/LICENSE +438 -0
welearn_database-0.1.0.dist-info/METADATA +44 -0
welearn_database-0.1.0.dist-info/RECORD +29 -0
welearn_database-0.1.0.dist-info/WHEEL +4 -0

welearn_database/__init__.py ADDED Viewed

File without changes

welearn_database/alembic/README ADDED Viewed

	@@ -0,0 +1 @@
1	+ Generic single-database configuration.

welearn_database/alembic/env.py ADDED Viewed

@@ -0,0 +1,94 @@
+from logging.config import fileConfig
+from alembic import context
+from dotenv import load_dotenv
+from welearn_database.database_utils import create_sqlalchemy_engine
+EXCLUDE_SCHEMAS_NAMES = ["public"]
+# this is the Alembic Config object, which provides
+# access to the values within the .ini file in use.
+config = context.config
+# here we allow ourselves to pass interpolation vars to alembic.ini
+# from the host env
+section = config.config_ini_section
+# Interpret the config file for Python logging.
+# This line sets up loggers basically.
+if config.config_file_name is not None:
+    fileConfig(config.config_file_name)
+# add your model's MetaData object here
+# for 'autogenerate' support
+from welearn_database.data.models import Base
+target_metadata = Base.metadata
+# other values from the config, defined by the needs of env.py,
+# can be acquired:
+# my_important_option = config.get_main_option("my_important_option")
+# ... etc.
+def run_migrations_offline() -> None:
+    """Run migrations in 'offline' mode.
+    This configures the context with just a URL
+    and not an Engine, though an Engine is acceptable
+    here as well.  By skipping the Engine creation
+    we don't even need a DBAPI to be available.
+    Calls to context.execute() here emit the given string to the
+    script output.
+    """
+    url = config.get_main_option("sqlalchemy.url")
+    context.configure(
+        url=url,
+        target_metadata=target_metadata,
+        literal_binds=True,
+        dialect_opts={"paramstyle": "named"},
+    )
+    with context.begin_transaction():
+        context.run_migrations()
+def include_name(name, type_, parent_names):
+    if type_ == "schema" and name in EXCLUDE_SCHEMAS_NAMES:
+        return False
+    elif type_ == "index":
+        return False
+    return True
+def run_migrations_online() -> None:
+    """Run migrations in 'online' mode.
+    In this scenario we need to create an Engine
+    and associate a connection with the context.
+    """
+    load_dotenv()
+    connectable = create_sqlalchemy_engine()
+    with connectable.connect() as connection:
+        context.configure(
+            connection=connection,
+            target_metadata=target_metadata,
+            version_table_schema="public",
+            include_schemas=True,
+            include_name=include_name,
+        )
+        with context.begin_transaction():
+            context.run_migrations()
+if context.is_offline_mode():
+    run_migrations_offline()
+else:
+    run_migrations_online()

welearn_database/alembic/script.py.mako ADDED Viewed

@@ -0,0 +1,26 @@
+"""${message}
+Revision ID: ${up_revision}
+Revises: ${down_revision | comma,n}
+Create Date: ${create_date}
+"""
+from typing import Sequence, Union
+from alembic import op
+import sqlalchemy as sa
+${imports if imports else ""}
+# revision identifiers, used by Alembic.
+revision: str = ${repr(up_revision)}
+down_revision: Union[str, None] = ${repr(down_revision)}
+branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
+depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
+def upgrade() -> None:
+    ${upgrades if upgrades else "pass"}
+def downgrade() -> None:
+    ${downgrades if downgrades else "pass"}

welearn_database/alembic/versions/16ff997426d3_remove_error_retrieval_unique_constraint.py ADDED Viewed

@@ -0,0 +1,38 @@
+"""Remove error retrieval unique constraint
+Revision ID: 16ff997426d3
+Revises: a50a1db3ca2a
+Create Date: 2025-06-02 14:23:49.689745
+"""
+from typing import Sequence, Union
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+from alembic import op
+# revision identifiers, used by Alembic.
+revision: str = "16ff997426d3"
+down_revision: Union[str, None] = "a50a1db3ca2a"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+def upgrade() -> None:
+    op.drop_index(
+        "error_retrieval_document_id_http_error_code_idx",
+        "error_retrieval",
+        schema="document_related",
+    )
+def downgrade() -> None:
+    op.create_index(
+        "error_retrieval_document_id_http_error_code_idx",
+        "error_retrieval",
+        ["document_id", "http_error_code"],
+        unique=True,
+        schema="document_related",
+    )

welearn_database/alembic/versions/4c7161819e5a_grafana_views.py ADDED Viewed

@@ -0,0 +1,189 @@
+"""grafana_views
+Revision ID: 4c7161819e5a
+Revises: 5d82613c9aca
+Create Date: 2025-10-08 13:55:36.123188
+"""
+from typing import Sequence, Union
+from alembic import op
+# revision identifiers, used by Alembic.
+revision: str = "4c7161819e5a"
+down_revision: Union[str, None] = "5d82613c9aca"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+def upgrade():
+    """Create or replace the read-only reporting views of the ``grafana`` schema.
+    Each statement is a CREATE OR REPLACE VIEW over tables of the
+    corpus_related, document_related and user_related schemas. downgrade()
+    drops the same views in the reverse order.
+    """
+    # Plain projection of corpus_related.corpus.
+    op.execute(
+        """
+    CREATE OR REPLACE VIEW grafana.corpus
+    AS SELECT corpus.id,
+    corpus.source_name,
+    corpus.is_fix,
+    corpus.binary_treshold
+   FROM corpus_related.corpus;
+    """
+    )
+    # Per corpus source_name: number of distinct documents having a
+    # process_state row with each title, documents without any state row,
+    # and the total number of documents.
+    op.execute(
+        """
+CREATE OR REPLACE VIEW grafana.document_state_summary
+AS SELECT corpus.source_name,
+    count(DISTINCT
+        CASE
+            WHEN process_state.title = 'url_retrieved'::document_related.step THEN process_state.document_id
+            ELSE NULL::uuid
+        END) AS url_retrieved_count,
+    count(DISTINCT
+        CASE
+            WHEN process_state.title = 'document_scraped'::document_related.step THEN process_state.document_id
+            ELSE NULL::uuid
+        END) AS document_scraped_count,
+    count(DISTINCT
+        CASE
+            WHEN process_state.title = 'document_is_irretrievable'::document_related.step THEN process_state.document_id
+            ELSE NULL::uuid
+        END) AS document_is_irretrievable_count,
+    count(DISTINCT
+        CASE
+            WHEN process_state.title = 'document_vectorized'::document_related.step THEN process_state.document_id
+            ELSE NULL::uuid
+        END) AS document_vectorized_count,
+    count(DISTINCT
+        CASE
+            WHEN process_state.title = 'document_classified_sdg'::document_related.step THEN process_state.document_id
+            ELSE NULL::uuid
+        END) AS document_classified_sdg_count,
+    count(DISTINCT
+        CASE
+            WHEN process_state.title = 'document_classified_non_sdg'::document_related.step THEN process_state.document_id
+            ELSE NULL::uuid
+        END) AS document_classified_non_sdg_count,
+    count(DISTINCT
+        CASE
+            WHEN process_state.title = 'document_in_qdrant'::document_related.step THEN process_state.document_id
+            ELSE NULL::uuid
+        END) AS document_in_qdrant_count,
+    count(DISTINCT
+        CASE
+            WHEN process_state.title = 'kept_for_trace'::document_related.step THEN process_state.document_id
+            ELSE NULL::uuid
+        END) AS kept_for_trace_count,
+    count(DISTINCT
+        CASE
+            WHEN process_state.title IS NULL THEN welearn_document.id
+            ELSE NULL::uuid
+        END) AS no_state_count,
+    count(DISTINCT welearn_document.id) AS total_documents
+   FROM corpus_related.corpus
+     LEFT JOIN document_related.welearn_document ON corpus.id = welearn_document.corpus_id
+     LEFT JOIN document_related.process_state ON welearn_document.id = process_state.document_id
+  GROUP BY corpus.source_name;
+    """
+    )
+    # Plain projection of user_related.endpoint_request.
+    op.execute(
+        """
+CREATE OR REPLACE VIEW grafana.endpoint_request
+AS SELECT endpoint_request.id,
+    endpoint_request.session_id,
+    endpoint_request.endpoint_name,
+    endpoint_request.http_code,
+    endpoint_request.message,
+    endpoint_request.created_at
+   FROM user_related.endpoint_request;
+    """
+    )
+    # Exposes only id and created_at of user_related.inferred_user.
+    op.execute(
+        """
+CREATE OR REPLACE VIEW grafana.inferred_user
+AS SELECT inferred_user.id,
+    inferred_user.created_at
+   FROM user_related.inferred_user;
+    """
+    )
+    # Plain projection of document_related.process_state.
+    op.execute(
+        """
+CREATE OR REPLACE VIEW grafana.process_state
+AS SELECT process_state.id,
+    process_state.document_id,
+    process_state.title,
+    process_state.created_at,
+    process_state.operation_order
+   FROM document_related.process_state;
+    """
+    )
+    # Number of endpoint requests per inferred user (joined through session).
+    op.execute(
+        """
+CREATE OR REPLACE VIEW grafana.qty_endpoints_per_user
+AS SELECT iu.*::user_related.inferred_user AS iu,
+    count(1) AS count
+   FROM user_related.endpoint_request er
+     JOIN user_related.session s ON s.id = er.session_id
+     JOIN user_related.inferred_user iu ON iu.id = s.inferred_user_id
+  GROUP BY iu.id;
+    """
+    )
+    # Per (inferred_user_id, host): distinct sessions and endpoint requests.
+    op.execute(
+        """
+CREATE OR REPLACE VIEW grafana.qty_session_endpoint_per_user
+AS SELECT s.inferred_user_id,
+    s.host,
+    count(DISTINCT s.id) AS count_sessions,
+    count(er.id) AS count_endpoints
+   FROM user_related.session s
+     LEFT JOIN user_related.endpoint_request er ON s.id = er.session_id
+  GROUP BY s.inferred_user_id, s.host;
+    """
+    )
+    # Per host: number of sessions and of distinct inferred users.
+    op.execute(
+        """
+CREATE OR REPLACE VIEW grafana.qty_session_user_per_host
+AS SELECT s.host,
+    count(1) AS count_sessions,
+    count(DISTINCT s.inferred_user_id) AS count_users
+   FROM user_related.session s
+  GROUP BY s.host;
+    """
+    )
+    # Plain projection of user_related.session (view name is quoted in the SQL).
+    op.execute(
+        """
+CREATE OR REPLACE VIEW grafana."session"
+AS SELECT session.id,
+    session.inferred_user_id,
+    session.created_at,
+    session.end_at,
+    session.host
+   FROM user_related.session;
+    """
+    )
+    # One row per document: DISTINCT ON keeps the process_state row with the
+    # highest operation_order, enriched with the document's corpus_id and lang.
+    op.execute(
+        """
+CREATE OR REPLACE VIEW grafana.document_latest_state
+AS SELECT DISTINCT ON (ps.document_id) ps.id,
+    ps.document_id,
+    wd.corpus_id,
+    wd.lang,
+    ps.title,
+    ps.created_at,
+    ps.operation_order
+   FROM document_related.process_state ps
+     JOIN document_related.welearn_document wd ON ps.document_id = wd.id
+  ORDER BY ps.document_id, ps.operation_order DESC;
+    """
+    )
+def downgrade():
+    op.execute("DROP VIEW IF EXISTS grafana.document_latest_state;")
+    op.execute("DROP VIEW IF EXISTS grafana.session;")
+    op.execute("DROP VIEW IF EXISTS grafana.qty_session_user_per_host;")
+    op.execute("DROP VIEW IF EXISTS grafana.qty_session_endpoint_per_user;")
+    op.execute("DROP VIEW IF EXISTS grafana.qty_endpoints_per_user;")
+    op.execute("DROP VIEW IF EXISTS grafana.process_state;")
+    op.execute("DROP VIEW IF EXISTS grafana.inferred_user;")
+    op.execute("DROP VIEW IF EXISTS grafana.endpoint_request;")
+    op.execute("DROP VIEW IF EXISTS grafana.document_state_summary;")
+    op.execute("DROP VIEW IF EXISTS grafana.corpus;")

welearn_database/alembic/versions/4fcbfb7f3145_added_api_key_management_table.py ADDED Viewed

@@ -0,0 +1,51 @@
+"""Added API Key Management table
+Revision ID: 4fcbfb7f3145
+Revises: 821173cf9c5d
+Create Date: 2025-03-18 16:31:23.532135
+"""
+from typing import Sequence, Union
+import sqlalchemy
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+from alembic import op
+# revision identifiers, used by Alembic.
+revision: str = "4fcbfb7f3145"
+down_revision: Union[str, None] = "821173cf9c5d"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table(
+        "api_key_management",
+        sa.Column(
+            "id",
+            sa.Uuid(),
+            server_default=sqlalchemy.func.gen_random_uuid(),
+            nullable=False,
+        ),
+        sa.Column("title", sa.String(), nullable=True),
+        sa.Column("is_active", sa.Boolean(), nullable=False),
+        sa.Column("register_email", sa.String(), nullable=False),
+        sa.Column("digest", sa.LargeBinary(), nullable=False),
+        sa.Column(
+            "created_at", postgresql.TIMESTAMP(), server_default="NOW()", nullable=False
+        ),
+        sa.Column(
+            "updated_at", postgresql.TIMESTAMP(), server_default="NOW()", nullable=False
+        ),
+        sa.PrimaryKeyConstraint("id"),
+        schema="user_related",
+    )
+def downgrade() -> None:
+    op.drop_table("api_key_management", schema="user_related")
+    # ### end Alembic commands ###

welearn_database/alembic/versions/5d82613c9aca_context_document.py ADDED Viewed

@@ -0,0 +1,67 @@
+"""context_document
+Revision ID: 5d82613c9aca
+Revises: e354666f951d
+Create Date: 2025-10-08 12:14:40.215929
+"""
+from typing import Sequence, Union
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+from alembic import op
+# revision identifiers, used by Alembic.
+revision: str = "5d82613c9aca"
+down_revision: Union[str, None] = "e354666f951d"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table(
+        "context_document",
+        sa.Column(
+            "id",
+            sa.Uuid(),
+            server_default=sa.func.gen_random_uuid(),
+            nullable=False,
+        ),
+        sa.Column("embedding_model_id", sa.Uuid(), nullable=False),
+        sa.Column("url", sa.String()),
+        sa.Column("full_content", sa.String()),
+        sa.Column("title", sa.String(), nullable=False),
+        sa.Column("sdg_related", sa.ARRAY(sa.INTEGER()), nullable=False),
+        sa.Column(
+            "created_at", postgresql.TIMESTAMP(), server_default="NOW()", nullable=False
+        ),
+        sa.Column("embedding", sa.LargeBinary(), nullable=True),
+        sa.Column(
+            "updated_at", postgresql.TIMESTAMP(), server_default="NOW()", nullable=False
+        ),
+        sa.Column(
+            "context_type",
+            postgresql.ENUM(
+                "introduction",
+                "target",
+                "subject",
+                name="context_type",
+                schema="document_related",
+            ),
+            nullable=False,
+        ),
+        sa.ForeignKeyConstraint(
+            ["embedding_model_id"],
+            ["corpus_related.embedding_model.id"],
+        ),
+        sa.PrimaryKeyConstraint("id"),
+        schema="document_related",
+    )
+def downgrade() -> None:
+    op.drop_table("context_document", schema="document_related")
+    # ### end Alembic commands ###