welearn-database 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. welearn_database/__init__.py +0 -0
  2. welearn_database/alembic/README +1 -0
  3. welearn_database/alembic/env.py +94 -0
  4. welearn_database/alembic/script.py.mako +26 -0
  5. welearn_database/alembic/versions/16ff997426d3_remove_error_retrieval_unique_constraint.py +38 -0
  6. welearn_database/alembic/versions/4c7161819e5a_grafana_views.py +189 -0
  7. welearn_database/alembic/versions/4fcbfb7f3145_added_api_key_management_table.py +51 -0
  8. welearn_database/alembic/versions/5d82613c9aca_context_document.py +67 -0
  9. welearn_database/alembic/versions/821173cf9c5d_initial_migration.py +441 -0
  10. welearn_database/alembic/versions/89920abb7ff8_add_category.py +54 -0
  11. welearn_database/alembic/versions/a50a1db3ca2a_add_used_since_column_for_embeddings.py +34 -0
  12. welearn_database/alembic/versions/b031206324b7_agent_related.py +80 -0
  13. welearn_database/alembic/versions/e354666f951d_inferred_user.py +88 -0
  14. welearn_database/data/__init__.py +0 -0
  15. welearn_database/data/enumeration.py +22 -0
  16. welearn_database/data/models/__init__.py +18 -0
  17. welearn_database/data/models/agent_related.py +0 -0
  18. welearn_database/data/models/corpus_related.py +98 -0
  19. welearn_database/data/models/document_related.py +454 -0
  20. welearn_database/data/models/grafana.py +0 -0
  21. welearn_database/data/models/user_related.py +216 -0
  22. welearn_database/database_utils.py +34 -0
  23. welearn_database/exceptions.py +7 -0
  24. welearn_database/modules/__init__.py +0 -0
  25. welearn_database/modules/text_cleaning.py +89 -0
  26. welearn_database-0.1.0.dist-info/LICENSE +438 -0
  27. welearn_database-0.1.0.dist-info/METADATA +44 -0
  28. welearn_database-0.1.0.dist-info/RECORD +29 -0
  29. welearn_database-0.1.0.dist-info/WHEEL +4 -0
File without changes
@@ -0,0 +1 @@
1
+ Generic single-database configuration.
@@ -0,0 +1,94 @@
1
+ from logging.config import fileConfig
2
+
3
+ from alembic import context
4
+ from dotenv import load_dotenv
5
+
6
+ from welearn_database.database_utils import create_sqlalchemy_engine
7
+
8
+ EXCLUDE_SCHEMAS_NAMES = ["public"]
9
+
10
+ # this is the Alembic Config object, which provides
11
+ # access to the values within the .ini file in use.
12
+ config = context.config
13
+
14
+ # here we allow ourselves to pass interpolation vars to alembic.ini
15
+ # from the host env
16
+ section = config.config_ini_section
17
+
18
+ # Interpret the config file for Python logging.
19
+ # This line sets up loggers basically.
20
+ if config.config_file_name is not None:
21
+ fileConfig(config.config_file_name)
22
+
23
+ # add your model's MetaData object here
24
+ # for 'autogenerate' support
25
+ from welearn_database.data.models import Base
26
+
27
+ target_metadata = Base.metadata
28
+
29
+ # other values from the config, defined by the needs of env.py,
30
+ # can be acquired:
31
+ # my_important_option = config.get_main_option("my_important_option")
32
+ # ... etc.
33
+
34
+
35
def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode.

    The context is configured with only the ``sqlalchemy.url`` value read
    from the Alembic config instead of an Engine, so no DBAPI has to be
    available.  Calls to context.execute() here emit the given string to
    the script output.

    The version table is pinned to the ``public`` schema, exactly as
    run_migrations_online() does, so that generated SQL scripts and live
    migrations record revisions in the same ``alembic_version`` table.
    """
    url = config.get_main_option("sqlalchemy.url")
    context.configure(
        url=url,
        target_metadata=target_metadata,
        literal_binds=True,
        dialect_opts={"paramstyle": "named"},
        # Must match the online configuration; otherwise the emitted script
        # references an unqualified alembic_version table.
        version_table_schema="public",
    )

    with context.begin_transaction():
        context.run_migrations()
57
+
58
+
59
def include_name(name, type_, parent_names):
    """Tell Alembic whether a named database object should be considered.

    This callable is handed to ``context.configure(include_name=...)``.

    Args:
        name: Name of the object being examined.
        type_: Kind of object, e.g. ``"schema"`` or ``"index"``.
        parent_names: Names of the enclosing objects (unused here).

    Returns:
        False for every index and for any schema listed in
        EXCLUDE_SCHEMAS_NAMES; True for everything else.
    """
    if type_ == "index":
        return False

    # The type check comes first so the exclusion list is only consulted
    # for schema objects.
    is_excluded_schema = type_ == "schema" and name in EXCLUDE_SCHEMAS_NAMES
    return not is_excluded_schema
66
+
67
+
68
def run_migrations_online() -> None:
    """Run migrations in 'online' mode.

    Loads variables from a ``.env`` file, builds the engine through
    create_sqlalchemy_engine(), then binds a live connection to the
    Alembic context and runs the migrations inside a transaction.
    """
    load_dotenv()
    engine = create_sqlalchemy_engine()

    # Options shared by every online run: the version table lives in
    # "public", all schemas are inspected, and include_name() filters the
    # reflected object names.
    configure_options = {
        "target_metadata": target_metadata,
        "version_table_schema": "public",
        "include_schemas": True,
        "include_name": include_name,
    }

    with engine.connect() as connection:
        context.configure(connection=connection, **configure_options)

        with context.begin_transaction():
            context.run_migrations()
89
+
90
+
91
+ if context.is_offline_mode():
92
+ run_migrations_offline()
93
+ else:
94
+ run_migrations_online()
@@ -0,0 +1,26 @@
1
+ """${message}
2
+
3
+ Revision ID: ${up_revision}
4
+ Revises: ${down_revision | comma,n}
5
+ Create Date: ${create_date}
6
+
7
+ """
8
+ from typing import Sequence, Union
9
+
10
+ from alembic import op
11
+ import sqlalchemy as sa
12
+ ${imports if imports else ""}
13
+
14
+ # revision identifiers, used by Alembic.
15
+ revision: str = ${repr(up_revision)}
16
+ down_revision: Union[str, None] = ${repr(down_revision)}
17
+ branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
18
+ depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
19
+
20
+
21
+ def upgrade() -> None:
22
+ ${upgrades if upgrades else "pass"}
23
+
24
+
25
+ def downgrade() -> None:
26
+ ${downgrades if downgrades else "pass"}
@@ -0,0 +1,38 @@
1
+ """Remove error retrieval unique constraint
2
+
3
+ Revision ID: 16ff997426d3
4
+ Revises: a50a1db3ca2a
5
+ Create Date: 2025-06-02 14:23:49.689745
6
+
7
+ """
8
+
9
+ from typing import Sequence, Union
10
+
11
+ import sqlalchemy as sa
12
+ from sqlalchemy.dialects import postgresql
13
+
14
+ from alembic import op
15
+
16
+ # revision identifiers, used by Alembic.
17
+ revision: str = "16ff997426d3"
18
+ down_revision: Union[str, None] = "a50a1db3ca2a"
19
+ branch_labels: Union[str, Sequence[str], None] = None
20
+ depends_on: Union[str, Sequence[str], None] = None
21
+
22
+
23
def upgrade() -> None:
    """Drop the index on ``document_related.error_retrieval``.

    The index removed is
    ``error_retrieval_document_id_http_error_code_idx``; downgrade()
    recreates it as a unique index.
    """
    index_name = "error_retrieval_document_id_http_error_code_idx"
    op.drop_index(
        index_name,
        table_name="error_retrieval",
        schema="document_related",
    )
29
+
30
+
31
def downgrade() -> None:
    """Recreate the unique index on ``document_related.error_retrieval``.

    The index covers ``(document_id, http_error_code)`` and is created
    with ``unique=True``.
    """
    index_name = "error_retrieval_document_id_http_error_code_idx"
    indexed_columns = ["document_id", "http_error_code"]
    op.create_index(
        index_name,
        table_name="error_retrieval",
        columns=indexed_columns,
        unique=True,
        schema="document_related",
    )
@@ -0,0 +1,189 @@
1
+ """grafana_views
2
+
3
+ Revision ID: 4c7161819e5a
4
+ Revises: 5d82613c9aca
5
+ Create Date: 2025-10-08 13:55:36.123188
6
+
7
+ """
8
+
9
+ from typing import Sequence, Union
10
+
11
+ from alembic import op
12
+
13
+ # revision identifiers, used by Alembic.
14
+ revision: str = "4c7161819e5a"
15
+ down_revision: Union[str, None] = "5d82613c9aca"
16
+ branch_labels: Union[str, Sequence[str], None] = None
17
+ depends_on: Union[str, Sequence[str], None] = None
18
+
19
+
20
def upgrade():
    """Create or replace the views in the ``grafana`` schema.

    Each statement is a ``CREATE OR REPLACE VIEW`` over tables in the
    ``corpus_related``, ``document_related`` and ``user_related`` schemas.
    The statements are executed one by one, in the order listed.

    NOTE(review): this migration does not create the ``grafana`` schema
    itself; presumably an earlier revision does -- confirm.
    """
    view_statements = (
        # Plain projection of corpus_related.corpus.
        """
    CREATE OR REPLACE VIEW grafana.corpus
    AS SELECT corpus.id,
        corpus.source_name,
        corpus.is_fix,
        corpus.binary_treshold
    FROM corpus_related.corpus;
    """,
        # Per corpus source: distinct document counts for each process
        # step, documents without any state, and the overall total.
        """
    CREATE OR REPLACE VIEW grafana.document_state_summary
    AS SELECT corpus.source_name,
        count(DISTINCT
            CASE
                WHEN process_state.title = 'url_retrieved'::document_related.step THEN process_state.document_id
                ELSE NULL::uuid
            END) AS url_retrieved_count,
        count(DISTINCT
            CASE
                WHEN process_state.title = 'document_scraped'::document_related.step THEN process_state.document_id
                ELSE NULL::uuid
            END) AS document_scraped_count,
        count(DISTINCT
            CASE
                WHEN process_state.title = 'document_is_irretrievable'::document_related.step THEN process_state.document_id
                ELSE NULL::uuid
            END) AS document_is_irretrievable_count,
        count(DISTINCT
            CASE
                WHEN process_state.title = 'document_vectorized'::document_related.step THEN process_state.document_id
                ELSE NULL::uuid
            END) AS document_vectorized_count,
        count(DISTINCT
            CASE
                WHEN process_state.title = 'document_classified_sdg'::document_related.step THEN process_state.document_id
                ELSE NULL::uuid
            END) AS document_classified_sdg_count,
        count(DISTINCT
            CASE
                WHEN process_state.title = 'document_classified_non_sdg'::document_related.step THEN process_state.document_id
                ELSE NULL::uuid
            END) AS document_classified_non_sdg_count,
        count(DISTINCT
            CASE
                WHEN process_state.title = 'document_in_qdrant'::document_related.step THEN process_state.document_id
                ELSE NULL::uuid
            END) AS document_in_qdrant_count,
        count(DISTINCT
            CASE
                WHEN process_state.title = 'kept_for_trace'::document_related.step THEN process_state.document_id
                ELSE NULL::uuid
            END) AS kept_for_trace_count,
        count(DISTINCT
            CASE
                WHEN process_state.title IS NULL THEN welearn_document.id
                ELSE NULL::uuid
            END) AS no_state_count,
        count(DISTINCT welearn_document.id) AS total_documents
    FROM corpus_related.corpus
        LEFT JOIN document_related.welearn_document ON corpus.id = welearn_document.corpus_id
        LEFT JOIN document_related.process_state ON welearn_document.id = process_state.document_id
    GROUP BY corpus.source_name;
    """,
        # Plain projection of user_related.endpoint_request.
        """
    CREATE OR REPLACE VIEW grafana.endpoint_request
    AS SELECT endpoint_request.id,
        endpoint_request.session_id,
        endpoint_request.endpoint_name,
        endpoint_request.http_code,
        endpoint_request.message,
        endpoint_request.created_at
    FROM user_related.endpoint_request;
    """,
        # Plain projection of user_related.inferred_user.
        """
    CREATE OR REPLACE VIEW grafana.inferred_user
    AS SELECT inferred_user.id,
        inferred_user.created_at
    FROM user_related.inferred_user;
    """,
        # Plain projection of document_related.process_state.
        """
    CREATE OR REPLACE VIEW grafana.process_state
    AS SELECT process_state.id,
        process_state.document_id,
        process_state.title,
        process_state.created_at,
        process_state.operation_order
    FROM document_related.process_state;
    """,
        # Number of endpoint requests per inferred user.
        """
    CREATE OR REPLACE VIEW grafana.qty_endpoints_per_user
    AS SELECT iu.*::user_related.inferred_user AS iu,
        count(1) AS count
    FROM user_related.endpoint_request er
        JOIN user_related.session s ON s.id = er.session_id
        JOIN user_related.inferred_user iu ON iu.id = s.inferred_user_id
    GROUP BY iu.id;
    """,
        # Sessions and endpoint requests per (inferred user, host).
        """
    CREATE OR REPLACE VIEW grafana.qty_session_endpoint_per_user
    AS SELECT s.inferred_user_id,
        s.host,
        count(DISTINCT s.id) AS count_sessions,
        count(er.id) AS count_endpoints
    FROM user_related.session s
        LEFT JOIN user_related.endpoint_request er ON s.id = er.session_id
    GROUP BY s.inferred_user_id, s.host;
    """,
        # Sessions and distinct inferred users per host.
        """
    CREATE OR REPLACE VIEW grafana.qty_session_user_per_host
    AS SELECT s.host,
        count(1) AS count_sessions,
        count(DISTINCT s.inferred_user_id) AS count_users
    FROM user_related.session s
    GROUP BY s.host;
    """,
        # Plain projection of user_related.session.
        """
    CREATE OR REPLACE VIEW grafana."session"
    AS SELECT session.id,
        session.inferred_user_id,
        session.created_at,
        session.end_at,
        session.host
    FROM user_related.session;
    """,
        # One row per document: the process_state row with the highest
        # operation_order, joined with the document's corpus and language.
        """
    CREATE OR REPLACE VIEW grafana.document_latest_state
    AS SELECT DISTINCT ON (ps.document_id) ps.id,
        ps.document_id,
        wd.corpus_id,
        wd.lang,
        ps.title,
        ps.created_at,
        ps.operation_order
    FROM document_related.process_state ps
        JOIN document_related.welearn_document wd ON ps.document_id = wd.id
    ORDER BY ps.document_id, ps.operation_order DESC;
    """,
    )

    for statement in view_statements:
        op.execute(statement)
177
+
178
+
179
def downgrade():
    """Drop the ``grafana`` views, tolerating views that are already absent.

    The views are dropped in the order listed below, each with
    ``DROP VIEW IF EXISTS``.
    """
    view_names = (
        "document_latest_state",
        "session",
        "qty_session_user_per_host",
        "qty_session_endpoint_per_user",
        "qty_endpoints_per_user",
        "process_state",
        "inferred_user",
        "endpoint_request",
        "document_state_summary",
        "corpus",
    )
    for view_name in view_names:
        op.execute(f"DROP VIEW IF EXISTS grafana.{view_name};")
@@ -0,0 +1,51 @@
1
+ """Added API Key Management table
2
+
3
+ Revision ID: 4fcbfb7f3145
4
+ Revises: 821173cf9c5d
5
+ Create Date: 2025-03-18 16:31:23.532135
6
+
7
+ """
8
+
9
+ from typing import Sequence, Union
10
+
11
+ import sqlalchemy
12
+ import sqlalchemy as sa
13
+ from sqlalchemy.dialects import postgresql
14
+
15
+ from alembic import op
16
+
17
+ # revision identifiers, used by Alembic.
18
+ revision: str = "4fcbfb7f3145"
19
+ down_revision: Union[str, None] = "821173cf9c5d"
20
+ branch_labels: Union[str, Sequence[str], None] = None
21
+ depends_on: Union[str, Sequence[str], None] = None
22
+
23
+
24
def upgrade() -> None:
    """Create the ``user_related.api_key_management`` table.

    Columns: a UUID primary key defaulting to ``gen_random_uuid()``, an
    optional ``title``, and required ``is_active``, ``register_email`` and
    ``digest`` columns, plus ``created_at`` / ``updated_at`` timestamps
    that default to ``NOW()`` on the server.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "api_key_management",
        sa.Column(
            "id",
            sa.Uuid(),
            server_default=sa.func.gen_random_uuid(),
            nullable=False,
        ),
        sa.Column("title", sa.String(), nullable=True),
        sa.Column("is_active", sa.Boolean(), nullable=False),
        sa.Column("register_email", sa.String(), nullable=False),
        sa.Column("digest", sa.LargeBinary(), nullable=False),
        # sa.text() is required here: a plain string server_default is
        # rendered as the quoted literal 'NOW()' instead of the function
        # call NOW().
        sa.Column(
            "created_at",
            postgresql.TIMESTAMP(),
            server_default=sa.text("NOW()"),
            nullable=False,
        ),
        sa.Column(
            "updated_at",
            postgresql.TIMESTAMP(),
            server_default=sa.text("NOW()"),
            nullable=False,
        ),
        sa.PrimaryKeyConstraint("id"),
        schema="user_related",
    )
47
+
48
+
49
def downgrade() -> None:
    """Drop the ``user_related.api_key_management`` table."""
    op.drop_table(table_name="api_key_management", schema="user_related")
    # ### end Alembic commands ###
@@ -0,0 +1,67 @@
1
+ """context_document
2
+
3
+ Revision ID: 5d82613c9aca
4
+ Revises: e354666f951d
5
+ Create Date: 2025-10-08 12:14:40.215929
6
+
7
+ """
8
+
9
+ from typing import Sequence, Union
10
+
11
+ import sqlalchemy as sa
12
+ from sqlalchemy.dialects import postgresql
13
+
14
+ from alembic import op
15
+
16
+ # revision identifiers, used by Alembic.
17
+ revision: str = "5d82613c9aca"
18
+ down_revision: Union[str, None] = "e354666f951d"
19
+ branch_labels: Union[str, Sequence[str], None] = None
20
+ depends_on: Union[str, Sequence[str], None] = None
21
+
22
+
23
def upgrade() -> None:
    """Create the ``document_related.context_document`` table.

    The table has a UUID primary key defaulting to ``gen_random_uuid()``,
    a foreign key to ``corpus_related.embedding_model.id``, and a
    ``context_type`` column typed with the
    ``document_related.context_type`` enum
    (``introduction`` / ``target`` / ``subject``).  ``created_at`` and
    ``updated_at`` default to ``NOW()`` on the server.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "context_document",
        sa.Column(
            "id",
            sa.Uuid(),
            server_default=sa.func.gen_random_uuid(),
            nullable=False,
        ),
        sa.Column("embedding_model_id", sa.Uuid(), nullable=False),
        sa.Column("url", sa.String()),
        sa.Column("full_content", sa.String()),
        sa.Column("title", sa.String(), nullable=False),
        sa.Column("sdg_related", sa.ARRAY(sa.INTEGER()), nullable=False),
        # sa.text() is required here: a plain string server_default is
        # rendered as the quoted literal 'NOW()' instead of the function
        # call NOW().
        sa.Column(
            "created_at",
            postgresql.TIMESTAMP(),
            server_default=sa.text("NOW()"),
            nullable=False,
        ),
        sa.Column("embedding", sa.LargeBinary(), nullable=True),
        sa.Column(
            "updated_at",
            postgresql.TIMESTAMP(),
            server_default=sa.text("NOW()"),
            nullable=False,
        ),
        sa.Column(
            "context_type",
            postgresql.ENUM(
                "introduction",
                "target",
                "subject",
                name="context_type",
                schema="document_related",
            ),
            nullable=False,
        ),
        sa.ForeignKeyConstraint(
            ["embedding_model_id"],
            ["corpus_related.embedding_model.id"],
        ),
        sa.PrimaryKeyConstraint("id"),
        schema="document_related",
    )
63
+
64
+
65
def downgrade() -> None:
    """Drop ``document_related.context_document`` and its enum type.

    op.drop_table() removes only the table; the
    ``document_related.context_type`` enum declared in upgrade() would be
    left behind and make a later re-upgrade fail on ``CREATE TYPE``, so it
    is dropped explicitly.

    NOTE(review): assumes no other table uses ``context_type`` -- confirm
    against the remaining migrations.
    """
    op.drop_table("context_document", schema="document_related")
    op.execute("DROP TYPE IF EXISTS document_related.context_type")
    # ### end Alembic commands ###
+ # ### end Alembic commands ###