PyPI - openedx-learning - Versions diffs - 0.29.0__py2.py3-none-any.whl → 0.30.0__py2.py3-none-any.whl - Mend

openedx-learning 0.29.0py2.py3-none-any.whl → 0.30.0py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

openedx_learning/apps/authoring/publishing/api.py CHANGED Viewed

@@ -13,9 +13,11 @@ from enum import Enum
 from typing import ContextManager, Optional, TypeVar
 from django.core.exceptions import ObjectDoesNotExist, ValidationError
-from django.db.models import F, Q, QuerySet
+from django.db.models import F, Prefetch, Q, QuerySet
 from django.db.transaction import atomic
+from openedx_learning.lib.fields import create_hash_digest
 from .contextmanagers import DraftChangeLogContext
 from .models import (
     Container,
@@ -31,9 +33,11 @@ from .models import (
     PublishableEntity,
     PublishableEntityMixin,
     PublishableEntityVersion,
+    PublishableEntityVersionDependency,
     PublishableEntityVersionMixin,
     PublishLog,
     PublishLogRecord,
+    PublishSideEffect,
 )
 from .models.publish_log import Published
@@ -213,6 +217,8 @@ def create_publishable_entity_version(
     title: str,
     created: datetime,
     created_by: int | None,
+    *,
+    dependencies: list[int] | None = None,  # PublishableEntity IDs
 ) -> PublishableEntityVersion:
     """
     Create a PublishableEntityVersion.
@@ -220,7 +226,7 @@ def create_publishable_entity_version(
     You'd typically want to call this right before creating your own content
     version model that points to it.
     """
-    with atomic():
+    with atomic(savepoint=False):
         version = PublishableEntityVersion.objects.create(
             entity_id=entity_id,
             version_num=version_num,
@@ -228,16 +234,82 @@ def create_publishable_entity_version(
             created=created,
             created_by_id=created_by,
         )
+        if dependencies:
+            set_version_dependencies(version.id, dependencies)
         set_draft_version(
             entity_id,
             version.id,
             set_at=created,
             set_by=created_by,
         )
     return version
+def set_version_dependencies(
+    version_id: int,  # PublishableEntityVersion.id,
+    /,
+    dependencies: list[int]  # List of PublishableEntity.id
+) -> None:
+    """
+    Set the dependencies of a publishable entity version.
+    In general, callers should not modify dependencies after creation (i.e. use
+    the optional param in create_publishable_entity_version() instead of using
+    this function). **This function is not atomic.** If you're doing backfills,
+    you must wrap calls to this function with a transaction.atomic() call.
+    The idea behind dependencies is that a PublishableEntity's Versions may
+    be declared to reference unpinned PublishableEntities. Changes to those
+    referenced PublishableEntities still affect the draft or published state of
+    the referent PublishableEntity, even if the referent entity's version is not
+    incremented.
+    For example, we only create a new UnitVersion when there are changes to the
+    metadata of the Unit itself. So this would happen when the name of the Unit
+    is changed, or if a child Component is added, removed, or reordered. No new
+    UnitVersion is created when a child Component of that Unit is modified or
+    published. Yet we still consider a Unit to be affected when one of its child
+    Components is modified or published. Therefore, we say that the child
+    Components are dependencies of the UnitVersion.
+    Dependencies vs. Container Rows/Children
+    Dependencies overlap with the concept of container child rows, but the two
+    are not exactly the same. For instance:
+    * Dependencies have no sense of ordering.
+    * If a row is declared to be pinned to a specific version of a child, then
+      it is NOT a dependency. For example, if U1.v1 is declared to have a pinned
+      reference to ComponentVersion C1.v1, then future changes to C1 do not
+      affect U1.v1 because U1.v1 will just ignore those new ComponentVersions.
+    Guidelines:
+    1. Only declare one level of dependencies, e.g. immediate parent-child
+       relationships. The publishing app will calculate transitive dependencies
+       like "all descendants" based on this. This is important for saving space,
+       because the DAG of trasitive dependency relationships can explode out to
+       tens of thousands of nodes per version.
+    2. Declare dependencies from the bottom-up when possible. In other words, if
+       you're building an entire Subsection, set the Component dependencies for
+       the Units before you set the Unit dependencies for the Subsection. This
+       code will still work if you build from the top-down, but we'll end up
+       doing many redundant re-calculations, since every change to a lower layer
+       will cause recalculation to the higher levels that depend on it.
+    3. Do not create circular dependencies.
+    """
+    PublishableEntityVersionDependency.objects.bulk_create(
+        [
+            PublishableEntityVersionDependency(
+                referring_version_id=version_id,
+                referenced_entity_id=dep_entity_id,
+            )
+            for dep_entity_id in set(dependencies)  # dependencies have no ordering
+        ],
+    )
 def get_publishable_entity(publishable_entity_id: int, /) -> PublishableEntity:
     return PublishableEntity.objects.get(id=publishable_entity_id)
@@ -285,7 +357,8 @@ def get_entities_with_unpublished_changes(
     """
     Fetch entities that have unpublished changes.
-    By default, this excludes soft-deleted drafts but can be included using include_deleted_drafts option.
+    By default, this excludes soft-deleted drafts but can be included using
+    include_deleted_drafts option.
     """
     entities_qs = (
         PublishableEntity.objects
@@ -333,65 +406,72 @@ def publish_all_drafts(
     Publish everything that is a Draft and is not already published.
     """
     draft_qset = (
-        Draft.objects.select_related("entity__published")
-        .filter(entity__learning_package_id=learning_package_id)
-        # Exclude entities where the Published version already matches the
-        # Draft version.
-        .exclude(entity__published__version_id=F("version_id"))
-        # Account for soft-deletes:
-        # NULL != NULL in SQL, so simply excluding entities where the Draft
-        # and Published versions match will not catch the case where a
-        # soft-delete has been published (i.e. both the Draft and Published
-        # versions are NULL). We need to explicitly check for that case
-        # instead, or else we will re-publish the same soft-deletes over
-        # and over again.
-        .exclude(Q(version__isnull=True) & Q(entity__published__version__isnull=True))
+        Draft.objects
+             .filter(entity__learning_package_id=learning_package_id)
+             .with_unpublished_changes()
     )
     return publish_from_drafts(
         learning_package_id, draft_qset, message, published_at, published_by
     )
+def _get_dependencies_with_unpublished_changes(
+    draft_qset: QuerySet[Draft]
+) -> list[QuerySet[Draft]]:
+    """
+    Return all dependencies to publish as a list of Draft QuerySets.
+    This should only return the Drafts that have actual changes, not pure side-
+    effects. The side-effect calculations will happen separately.
+    """
+    # First we have to do a full crawl of *all* dependencies, regardless of
+    # whether they have unpublished changes or not. This is because we might
+    # have a dependency-of-a-dependency that has changed somewhere down the
+    # line. Example: The draft_qset includes a Subsection. Even if the Unit
+    # versions are still the same, there might be a changed Component that needs
+    # to be published.
+    all_dependency_drafts = []
+    dependency_drafts = Draft.objects.filter(
+        entity__affects__in=draft_qset.values_list("version_id", flat=True)
+    ).distinct()
+    while dependency_drafts:
+        all_dependency_drafts.append(dependency_drafts)
+        dependency_drafts = Draft.objects.filter(
+            entity__affects__in=dependency_drafts.all().values_list("version_id", flat=True)
+        ).distinct()
+    unpublished_dependency_drafts = [
+        dependency_drafts_qset.all().with_unpublished_changes()
+        for dependency_drafts_qset in all_dependency_drafts
+    ]
+    return unpublished_dependency_drafts
 def publish_from_drafts(
     learning_package_id: int,  # LearningPackage.id
     /,
-    draft_qset: QuerySet,
+    draft_qset: QuerySet[Draft],
     message: str = "",
     published_at: datetime | None = None,
     published_by: int | None = None,  # User.id
+    *,
+    publish_dependencies: bool = True,
 ) -> PublishLog:
     """
     Publish the rows in the ``draft_model_qsets`` args passed in.
+    By default, this will also publish all dependencies (e.g. unpinned children)
+    of the Drafts that are passed in.
     """
     if published_at is None:
         published_at = datetime.now(tz=timezone.utc)
     with atomic():
-        # If the drafts include any containers, we need to auto-publish their descendants:
-        # TODO: this only handles one level deep and would need to be updated to support sections > subsections > units
-        # Get the IDs of the ContainerVersion for any Containers whose drafts are slated to be published.
-        container_version_ids = (
-            Container.objects.filter(publishable_entity__draft__in=draft_qset)
-            .values_list("publishable_entity__draft__version__containerversion__pk", flat=True)
-        )
-        if container_version_ids:
-            # We are publishing at least one container. Check if it has any child components that aren't already slated
-            # to be published.
-            unpublished_draft_children = EntityListRow.objects.filter(
-                entity_list__container_versions__pk__in=container_version_ids,
-                entity_version=None,  # Unpinned entities only
-            ).exclude(
-                entity__draft__version=F("entity__published__version")  # Exclude already published things
-            ).values_list("entity__draft__pk", flat=True)
-            if unpublished_draft_children:
-                # Force these additional child components to be published at the same time by adding them to the qset:
-                draft_qset = Draft.objects.filter(
-                    Q(pk__in=draft_qset.values_list("pk", flat=True)) |
-                    Q(pk__in=unpublished_draft_children)
-                )
+        if publish_dependencies:
+            dependency_drafts_qsets = _get_dependencies_with_unpublished_changes(draft_qset)
+        else:
+            dependency_drafts_qsets = []
         # One PublishLog for this entire publish operation.
         publish_log = PublishLog(
@@ -403,32 +483,51 @@ def publish_from_drafts(
         publish_log.full_clean()
         publish_log.save(force_insert=True)
-        for draft in draft_qset.select_related("entity__published__version"):
-            try:
-                old_version = draft.entity.published.version
-            except ObjectDoesNotExist:
-                # This means there is no published version yet.
-                old_version = None
-            # Create a record describing publishing this particular Publishable
-            # (useful for auditing and reverting).
-            publish_log_record = PublishLogRecord(
-                publish_log=publish_log,
-                entity=draft.entity,
-                old_version=old_version,
-                new_version=draft.version,
-            )
-            publish_log_record.full_clean()
-            publish_log_record.save(force_insert=True)
+        # We're intentionally avoiding union() here because Django ORM unions
+        # introduce cumbersome restrictions (can only union once, can't
+        # select_related on it after, the extra rows must be exactly compatible
+        # in unioned qsets, etc.) Instead, we're going to have one queryset per
+        # dependency layer.
+        all_draft_qsets = [
+            draft_qset.select_related("entity__published__version"),
+            *dependency_drafts_qsets,  # one QuerySet per layer of dependencies
+        ]
+        published_draft_ids = set()
+        for qset in all_draft_qsets:
+            for draft in qset:
+                # Skip duplicates that we might get from expanding dependencies.
+                if draft.pk in published_draft_ids:
+                    continue
+                try:
+                    old_version = draft.entity.published.version
+                except ObjectDoesNotExist:
+                    # This means there is no published version yet.
+                    old_version = None
+                # Create a record describing publishing this particular
+                # Publishable (useful for auditing and reverting).
+                publish_log_record = PublishLogRecord(
+                    publish_log=publish_log,
+                    entity=draft.entity,
+                    old_version=old_version,
+                    new_version=draft.version,
+                )
+                publish_log_record.full_clean()
+                publish_log_record.save(force_insert=True)
+                # Update the lookup we use to fetch the published versions
+                Published.objects.update_or_create(
+                    entity=draft.entity,
+                    defaults={
+                        "version": draft.version,
+                        "publish_log_record": publish_log_record,
+                    },
+                )
-            # Update the lookup we use to fetch the published versions
-            Published.objects.update_or_create(
-                entity=draft.entity,
-                defaults={
-                    "version": draft.version,
-                    "publish_log_record": publish_log_record,
-                },
-            )
+                published_draft_ids.add(draft.pk)
+        _create_side_effects_for_change_log(publish_log)
     return publish_log
@@ -505,8 +604,8 @@ def set_draft_version(
     from Studio's editing point of view (see ``soft_delete_draft`` for more
     details).
-    Calling this function attaches a new DraftChangeLogRecordand attaches it to a
-    DraftChangeLog.
+    Calling this function attaches a new DraftChangeLogRecord and attaches it to
+    a DraftChangeLog.
     This function will create DraftSideEffect entries and properly add any
     containers that may have been affected by this draft update, UNLESS it is
@@ -538,7 +637,6 @@ def set_draft_version(
         # The actual update of the Draft model is here. Everything after this
         # block is bookkeeping in our DraftChangeLog.
         draft.version_id = publishable_entity_version_pk
-        draft.save()
         # Check to see if we're inside a context manager for an active
         # DraftChangeLog (i.e. what happens if the caller is using the public
@@ -548,23 +646,52 @@ def set_draft_version(
             learning_package_id
         )
         if active_change_log:
-            change = _add_to_existing_draft_change_log(
+            draft_log_record = _add_to_existing_draft_change_log(
                 active_change_log,
                 draft.entity_id,
                 old_version_id=old_version_id,
                 new_version_id=publishable_entity_version_pk,
             )
-            # We explicitly *don't* create container side effects here because
-            # there may be many changes in this DraftChangeLog, some of which
-            # haven't been made yet. It wouldn't make sense to create a side
-            # effect that says, "this Unit changed because this Component in it
-            # changed" if we were changing that same Unit later on in the same
+            if draft_log_record:
+                # Normal case: a DraftChangeLogRecord was created or updated.
+                draft.draft_log_record = draft_log_record
+            else:
+                # Edge case: this change cancelled out other changes, and the
+                # net effect is that the DraftChangeLogRecord shouldn't exist,
+                # i.e. the version at the start and end of the DraftChangeLog is
+                # the same. In that case, _add_to_existing_draft_change_log will
+                # delete the log record for this Draft state, and we have to
+                # look for the most recently created DraftLogRecord from another
+                # DraftChangeLog. This value may be None.
+                #
+                # NOTE: There may be some weird edge cases here around soft-
+                # deletions and modifications of the same Draft entry in nested
+                # bulk_draft_changes_for() calls. I haven't thought that through
+                # yet--it might be better to just throw an error rather than try
+                # to accommodate it.
+                draft.draft_log_record = (
+                    DraftChangeLogRecord.objects
+                                        .filter(entity_id=draft.entity_id)
+                                        .order_by("-pk")
+                                        .first()
+                )
+            draft.save()
+            # We also *don't* create container side effects here because there
+            # may be many changes in this DraftChangeLog, some of which haven't
+            # been made yet. It wouldn't make sense to create a side effect that
+            # says, "this Unit changed because this Component in it changed" if
+            # we were changing that same Unit later on in the same
             # DraftChangeLog, because that new Unit version might not even
-            # include the child Component. So we'll let DraftChangeLogContext
-            # do that work when it exits its context.
+            # include that child Component. Also, calculating side-effects is
+            # expensive, and would result in a lot of wasted queries if we did
+            # it for every change will inside an active change log context.
+            #
+            # Therefore we'll let DraftChangeLogContext do that work when it
+            # exits its context.
         else:
             # This means there is no active DraftChangeLog, so we create our own
-            # and add our DraftChangeLogRecord to it. This has the minor
+            # and add our DraftChangeLogRecord to it. This has the very minor
             # optimization that we don't have to check for an existing
             # DraftChangeLogRecord, because we know it can't exist yet.
             change_log = DraftChangeLog.objects.create(
@@ -572,13 +699,14 @@ def set_draft_version(
                 changed_at=set_at,
                 changed_by_id=set_by,
             )
-            change = DraftChangeLogRecord.objects.create(
+            draft.draft_log_record = DraftChangeLogRecord.objects.create(
                 draft_change_log=change_log,
                 entity_id=draft.entity_id,
                 old_version_id=old_version_id,
                 new_version_id=publishable_entity_version_pk,
             )
-            _create_container_side_effects_for_draft_change(change)
+            draft.save()
+            _create_side_effects_for_change_log(change_log)
 def _add_to_existing_draft_change_log(
@@ -586,9 +714,9 @@ def _add_to_existing_draft_change_log(
     entity_id: int,
     old_version_id: int | None,
     new_version_id: int | None,
-) -> DraftChangeLogRecord:
+) -> DraftChangeLogRecord | None:
     """
-    Create or update a DraftChangeLogRecord for the active_change_log passed in.
+    Create, update, or delete a DraftChangeLogRecord for the active_change_log.
     The an active_change_log may have many DraftChangeLogRecords already
     associated with it. A DraftChangeLog can only have one DraftChangeLogRecord
@@ -602,6 +730,13 @@ def _add_to_existing_draft_change_log(
     and the same entity_id but with versions: (None, v1), (v1, v2), (v2, v3);
     we would collapse them into one DraftChangeLogrecord with old_version = None
     and new_version = v3.
+    This also means that if we make a change that undoes the previos change, it
+    will delete that DraftChangeLogRecord, e.g. (None, v1) -> (None, v2) ->
+    (None -> None), then this entry can be deleted because it didn't change
+    anything. This function should never be used for creating side-effect change
+    log records (the only place where it's normal to have the same old and new
+    versions).
     """
     try:
         # Check to see if this PublishableEntity has already been changed in
@@ -622,8 +757,11 @@ def _add_to_existing_draft_change_log(
             #
             # It's important that we remove these cases, because we use the
             # old_version == new_version convention to record entities that have
-            # changed purely due to side-effects.
+            # changed purely due to side-effects. We could technically still
+            # differentiate those by actually looking at the DraftSideEffect and
+            # PublishSideEffect models, but this is less confusing overall.
             change.delete()
+            return None
         else:
             # Normal case: We update the new_version, but leave the old_version
             # as is. The old_version represents what the Draft was pointing to
@@ -644,87 +782,418 @@ def _add_to_existing_draft_change_log(
     return change
-def _create_container_side_effects_for_draft_change_log(change_log: DraftChangeLog):
-    """
-    Iterate through the whole DraftChangeLog and process side-effects.
-    """
+def _create_side_effects_for_change_log(change_log: DraftChangeLog | PublishLog):
+    """
+    Create the side-effects for a DraftChangeLog or PublishLog.
+    A side-effect is created whenever a dependency of a draft or published
+    entity version is altered.
+    For example, say we have a published Unit at version 1 (``U1.v1``).
+    ``U1.v1`` is defined to have unpinned references to Components ``C1`` and
+    ``C2``, i.e. ``U1.v1 = [C1, C2]``. This means that ``U1.v1`` always shows
+    the latest published versions of ``C1`` and ``C2``. While we do have a more
+    sophisticated encoding for the ordered parent-child relationships, we
+    capture the basic dependency relationship using the M:M
+    ``PublishableEntityVersionDependency`` model. In this scenario, C1 and C2
+    are dependencies of ``U1.v1``
+    In the above example, publishing a newer version of ``C1`` does *not*
+    increment the version for Unit ``U1``. But we still want to record that
+    ``U1.v1`` was affected by the change in ``C1``. We record this in the
+    ``DraftSideEffect`` and ``PublishSideEffect`` models. We also add an entry
+    for ``U1`` in the change log, saying that it went from version 1 to version
+    1--i.e nothing about the Unit's defintion changed, but it was still affected
+    by other changes in the log.
+    Only call this function after all the records have already been created.
+    Note: The interface between ``DraftChangeLog`` and ``PublishLog`` is similar
+    enough that this function has been made to work with both.
+    """
+    branch_cls: type[Draft] | type[Published]
+    change_record_cls: type[DraftChangeLogRecord] | type[PublishLogRecord]
+    side_effect_cls: type[DraftSideEffect] | type[PublishSideEffect]
+    if isinstance(change_log, DraftChangeLog):
+        branch_cls = Draft
+        change_record_cls = DraftChangeLogRecord
+        side_effect_cls = DraftSideEffect
+        log_record_rel = "draft_log_record_id"
+    elif isinstance(change_log, PublishLog):
+        branch_cls = Published
+        change_record_cls = PublishLogRecord
+        side_effect_cls = PublishSideEffect
+        log_record_rel = "publish_log_record_id"
+    else:
+        raise TypeError(
+            f"expected DraftChangeLog or PublishLog, not {type(change_log)}"
+        )
+    # processed_entity_ids holds the entity IDs that we've already calculated
+    # side-effects for. This is to save us from recalculating side-effects for
+    # the same dependency relationships over and over again. So if we're calling
+    # this function in a loop for all the Components in a Unit, we won't be
+    # recalculating the Unit's side-effect on its Subsection, and its
+    # Subsection's side-effect on its Section each time through the loop.
+    # It also guards against infinite parent-child relationship loops, though
+    # those aren't *supposed* to happen anyhow.
     processed_entity_ids: set[int] = set()
-    for change in change_log.records.all():
-        _create_container_side_effects_for_draft_change(
-            change,
-            processed_entity_ids=processed_entity_ids,
+    for original_change in change_log.records.all():
+        affected_by_original_change = branch_cls.objects.filter(
+            version__dependencies=original_change.entity
         )
+        changes_and_affected = [
+            (original_change, current) for current in affected_by_original_change
+        ]
+        # These are the Published or Draft objects where we need to repoint the
+        # log_record (publish_log_record or draft_change_log_record) to point to
+        # the side-effect changes, e.g. the DraftChangeLogRecord that says,
+        # "This Unit's version stayed the same, but its dependency hash changed
+        # because a child Component's draft version was changed." We gather them
+        # all up in a list so we can do a bulk_update on them.
+        branch_objs_to_update_with_side_effects = []
+        while changes_and_affected:
+            change, affected = changes_and_affected.pop()
+            change_log_param = {}
+            if branch_cls == Draft:
+                change_log_param['draft_change_log'] = change.draft_change_log  # type: ignore[union-attr]
+            elif branch_cls == Published:
+                change_log_param['publish_log'] = change.publish_log  # type: ignore[union-attr]
+            # Example: If the original_change is a DraftChangeLogRecord that
+            # represents editing a Component, the side_effect_change is the
+            # DraftChangeLogRecord that represents the fact that the containing
+            # Unit was also altered (even if the Unit version doesn't change).
+            side_effect_change, _created = change_record_cls.objects.get_or_create(
+                **change_log_param,
+                entity_id=affected.entity_id,
+                defaults={
+                    # If a change record already exists because the affected
+                    # entity was separately modified, then we don't touch the
+                    # old/new version entries. But if we're creating this change
+                    # record as a pure side-effect, then we use the (old_version
+                    # == new_version) convention to indicate that.
+                    'old_version_id': affected.version_id,
+                    'new_version_id': affected.version_id,
+                }
+            )
+            # Update the current branch pointer (Draft or Published) for this
+            # entity to point to the side_effect_change (if it's not already).
+            if branch_cls == Published:
+                published_obj = affected.entity.published
+                if published_obj.publish_log_record != side_effect_change:
+                    published_obj.publish_log_record = side_effect_change
+                    branch_objs_to_update_with_side_effects.append(published_obj)
+            elif branch_cls == Draft:
+                draft_obj = affected.entity.draft
+                if draft_obj.draft_log_record != side_effect_change:
+                    draft_obj.draft_log_record = side_effect_change
+                    branch_objs_to_update_with_side_effects.append(draft_obj)
+            # Create a new side effect (DraftSideEffect or PublishSideEffect) to
+            # record the relationship between the ``change`` and the
+            # corresponding ``side_effect_change``. We'll do this regardless of
+            # whether we created the ``side_effect_change`` or just pulled back
+            # an existing one. This addresses two things:
+            #
+            # 1. A change in multiple dependencies can generate multiple
+            #    side effects that point to the same change log record, i.e.
+            #    multiple changes can cause the same ``effect``.
+            #    Example: Two draft components in a Unit are changed. Two
+            #    DraftSideEffects will be created and point to the same Unit
+            #    DraftChangeLogRecord.
+            # 2. A entity and its dependency can change at the same time.
+            #    Example: If a Unit has a Component, and both the Unit and
+            #    Component are edited in the same DraftChangeLog, then the Unit
+            #    has changed in both ways (the Unit's internal metadata as well
+            #    as the new version of the child component). The version of the
+            #    Unit will be incremented, but we'll also create the
+            #    DraftSideEffect.
+            side_effect_cls.objects.get_or_create(
+                cause=change,
+                effect=side_effect_change,
+            )
-def _create_container_side_effects_for_draft_change(
-    original_change: DraftChangeLogRecord,
-    processed_entity_ids: set | None = None
-):
-    """
-    Given a draft change, add side effects for all affected containers.
+            # Now we find the next layer up by looking at Drafts or Published
+            # that have ``affected.entity`` as a dependency.
+            next_layer_of_affected = branch_cls.objects.filter(
+                version__dependencies=affected.entity
+            )
+            # Make sure we never re-add the change we just processed when we
+            # queue up the next layer.
+            processed_entity_ids.add(change.entity_id)
+            changes_and_affected.extend(
+                (side_effect_change, affected)
+                for affected in next_layer_of_affected
+                if affected.entity_id not in processed_entity_ids
+            )
+        branch_cls.objects.bulk_update(
+            branch_objs_to_update_with_side_effects,  # type: ignore[arg-type]
+            [log_record_rel],
+        )
-    This should only be run after the DraftChangeLogRecord has been otherwise
-    fully written out. We want to avoid the scenario where we create a
-    side-effect that a Component change affects a Unit if the Unit version is
-    also changed (maybe even deleted) in the same DraftChangeLog.
+    update_dependencies_hash_digests_for_log(change_log)
-    The `processed_entity_ids` set holds the entity IDs that we've already
-    calculated side-effects for. This is to save us from recalculating side-
-    effects for the same ancestor relationships over and over again. So if we're
-    calling this function in a loop for all the Components in a Unit, we won't
-    be recalculating the Unit's side-effect on its Subsection, and its
-    Subsection's side-effect on its Section.
-    TODO: This could get very expensive with the get_containers_with_entity
-    calls. We should measure the impact of this.
+def update_dependencies_hash_digests_for_log(
+    change_log: DraftChangeLog | PublishLog,
+    backfill: bool = False,
+) -> None:
     """
-    if processed_entity_ids is None:
-        # An optimization, but also a guard against infinite side-effect loops.
-        processed_entity_ids = set()
+    Update dependencies_hash_digest for Drafts or Published in a change log.
-    changes_and_containers = [
-        (original_change, container)
-        for container
-        in get_containers_with_entity(original_change.entity_id, ignore_pinned=True)
-    ]
-    while changes_and_containers:
-        change, container = changes_and_containers.pop()
-        # If the container is not already in the DraftChangeLog, we need to
-        # add it. Since it's being caused as a DraftSideEffect, we're going
-        # add it with the old_version == new_version convention.
-        container_draft_version_pk = container.versioning.draft.pk
-        container_change, _created = DraftChangeLogRecord.objects.get_or_create(
-            draft_change_log=change.draft_change_log,
-            entity_id=container.pk,
-            defaults={
-                'old_version_id': container_draft_version_pk,
-                'new_version_id': container_draft_version_pk
-            }
+    All the data for Draft/Published, DraftChangeLog/PublishLog, and
+    DraftChangeLogRecord/PublishLogRecord have been set at this point *except*
+    the dependencies_hash_digest of DraftChangeLogRecord/PublishLogRecord. Those
+    log records are newly created at this point, so dependencies_hash_digest are
+    set to their default values.
+    Args:
+        change_log: A DraftChangeLog or PublishLog that already has all
+            side-effects added to it. The Draft and Published models should
+            already be updated to point to the post-change versions.
+        backfill: If this is true, we will not trust the hash values stored on
+            log records outside of our log, i.e. things that we would normally
+            expect to be pre-calculated. This will be important for the initial
+            data migration.
+    """
+    if isinstance(change_log, DraftChangeLog):
+        branch = "draft"
+        log_record_relation = "draft_log_record"
+        record_cls = DraftChangeLogRecord
+    elif isinstance(change_log, PublishLog):
+        branch = "published"
+        log_record_relation = "publish_log_record"
+        record_cls = PublishLogRecord  # type: ignore[assignment]
+    else:
+        raise TypeError(
+            f"expected DraftChangeLog or PublishLog, not {type(change_log)}"
         )
-        # Mark that change in the current loop has the side effect of changing
-        # the parent container. We'll do this regardless of whether the
-        # container version itself also changed. If a Unit has a Component and
-        # both the Unit and Component have their versions incremented, then the
-        # Unit has changed in both ways (the Unit's internal metadata as well as
-        # the new version of the child component).
-        DraftSideEffect.objects.get_or_create(cause=change, effect=container_change)
-        processed_entity_ids.add(change.entity_id)
-        # Now we find the next layer up of containers. So if the originally
-        # passed in publishable_entity_id was for a Component, then the
-        # ``container`` we've been creating the side effect for in this loop
-        # is the Unit, and ``parents_of_container`` would be any Sequences
-        # that contain the Unit.
-        parents_of_container = get_containers_with_entity(container.pk, ignore_pinned=True)
-        changes_and_containers.extend(
-            (container_change, container_parent)
-            for container_parent in parents_of_container
-            if container_parent.pk not in processed_entity_ids
+    dependencies_prefetch = Prefetch(
+        "new_version__dependencies",
+        queryset=PublishableEntity.objects
+                                  .select_related(
+                                      f"{branch}__version",
+                                      f"{branch}__{log_record_relation}",
+                                   )
+                                  .order_by(f"{branch}__version__uuid")
+    )
+    changed_records: QuerySet[DraftChangeLogRecord] | QuerySet[PublishLogRecord]
+    changed_records = (
+        change_log.records
+                  .select_related("new_version", f"entity__{branch}")
+                  .prefetch_related(dependencies_prefetch)
+    )
+    record_ids_to_hash_digests: dict[int, str | None] = {}
+    record_ids_to_live_deps: dict[int, list[PublishableEntity]] = {}
+    records_that_need_hashes = []
+    for record in changed_records:
+        # This is a soft-deletion, so the dependency hash is default/blank. We
+        # set this value in our record_ids_to_hash_digests cache, but we don't
+        # need to write it to the database because it's just the default value.
+        if record.new_version is None:
+            record_ids_to_hash_digests[record.id] = ''
+            continue
+        # Now check to see if the new version has "live" dependencies, i.e.
+        # dependencies that have not been deleted.
+        deps = list(
+            entity for entity in record.new_version.dependencies.all()
+            if hasattr(entity, branch) and getattr(entity, branch).version
         )
+        # If there are no live dependencies, this log record also gets the
+        # default/blank value.
+        if not deps:
+            record_ids_to_hash_digests[record.id] = ''
+            continue
+        # If we've gotten this far, it means that this record has dependencies
+        # and does need to get a hash computed for it.
+        records_that_need_hashes.append(record)
+        record_ids_to_live_deps[record.id] = deps
+    if backfill:
+        untrusted_record_id_set = None
+    else:
+        untrusted_record_id_set = set(rec.id for rec in records_that_need_hashes)
+    for record in records_that_need_hashes:
+        record.dependencies_hash_digest = hash_for_log_record(
+            record,
+            record_ids_to_hash_digests,
+            record_ids_to_live_deps,
+            untrusted_record_id_set,
+        )
+    _bulk_update_hashes(record_cls, records_that_need_hashes)
+def _bulk_update_hashes(model_cls, records):
+    """
+    bulk_update using the model class (PublishLogRecord or DraftChangeLogRecord)
+    The only reason this function exists is because mypy 1.18.2 throws an
+    exception in validate_bulk_update() during "make quality" checks otherwise
+    (though curiously enough, not when that same version of mypy is called
+    directly). Given that I'm writing this on the night before the Ulmo release
+    cut, I'm not really interested in tracking down the underlying issue.
+    The lack of type annotations on this function is very intentional.
+    """
+    model_cls.objects.bulk_update(records, ['dependencies_hash_digest'])
+def hash_for_log_record(
+    record: DraftChangeLogRecord | PublishLogRecord,
+    record_ids_to_hash_digests: dict,
+    record_ids_to_live_deps: dict,
+    untrusted_record_id_set: set | None,
+) -> str:
+    """
+    The hash digest for a given change log record.
+    Note that this code is a little convoluted because we're working hard to
+    minimize the number of database requests. All the data we really need could
+    be derived from querying various relations off the record that's passed in
+    as the first parameter, but at a far higher cost.
+    The hash calculated here will be used for the dependencies_hash_digest
+    attribute of DraftChangeLogRecord and PublishLogRecord. The hash is intended
+    to calculate the currently "live" (current draft or published) state of all
+    dependencies (and transitive dependencies) of the PublishableEntityVersion
+    pointed to by DraftChangeLogRecord.new_version/PublishLogRecord.new_version.
+    The common case we have at the moment is when a container type like a Unit
+    has unpinned child Components as dependencies. In the data model, those
+    dependency relationships are represented by the "dependencies" M:M relation
+    on PublishableEntityVersion. Since the Unit version's references to its
+    child Components are unpinned, the draft Unit is always pointing to the
+    latest draft versions of those Components and the published Unit is always
+    pointing to the latest published versions of those Components.
+    This means that the total draft or published state of any PublishableEntity
+    depends on the combination of:
+    1. The definition of the current draft/published version of that entity.
+       Example: Version 1 of a Unit would define that it had children [C1, C2].
+       Version 2 of the same Unit might have children [C1, C2, C3].
+    2. The current draft/published versions of all dependencies. Example: What
+       are the current draft and published versions of C1, C2, and C3.
+    This is why it makes sense to capture in a log record, since
+    PublishLogRecords or DraftChangeLogRecords are created whenever one of the
+    above two things changes.
+    Here are the possible scenarios, including edge cases:
+    EntityVersions with no dependencies
+      If record.new_version has no dependencies, dependencies_hash_digest is
+      set to the default value of ''. This will be the most common case.
+    EntityVersions with dependencies
+      If an EntityVersion has dependencies, then its draft/published state
+      hash is based on the concatenation of, for each non-deleted dependency:
+        (i)  the dependency's draft/published EntityVersion primary key, and
+        (ii) the dependency's own draft/published state hash, recursively re-
+             calculated if necessary.
+    Soft-deletions
+      If the record.new_version is None, that means we've just soft-deleted
+      something (or published the soft-delete of something). We adopt the
+      convention that if something is soft-deleted, its dependencies_hash_digest
+      is reset to the default value of ''. This is not strictly necessary for
+      the recursive hash calculation, but deleted entities will not have their
+      hash updated even as their non-deleted dependencies are updated underneath
+      them, so we set to '' to avoid falsely implying that the deleted entity's
+      dep hash is up to date.
+    EntityVersions with soft-deleted dependencies
+      A soft-deleted dependency isn't counted (it's as if the dependency were
+      removed). If all of an EntityVersion's dependencies are soft-deleted,
+      then it will go back to having to having the default blank string for its
+      dependencies_hash_digest.
+    """
+    # Case #1: We've already computed this, or it was bootstrapped for us in the
+    # cache because the record is a deletion or doesn't have dependencies.
+    if record.id in record_ids_to_hash_digests:
+        return record_ids_to_hash_digests[record.id]
+    # Case #2: The log_record is a dependency of something that was affected by
+    # a change, but the dependency itself did not change in any way (neither
+    # directly, nor as a side-effect).
+    #
+    # Example: A Unit has two Components. One of the Components changed, forcing
+    # us to recalculate the dependencies_hash_digest for that Unit. Doing that
+    # recalculation requires us to fetch the dependencies_hash_digest of the
+    # unchanged child Component as well.
+    #
+    # If we aren't given an explicit untrusted_record_id_set, it means we can't
+    # trust anything. This would happen when we're bootstrapping things with an
+    # initial data migration.
+    if (untrusted_record_id_set is not None) and (record.id not in untrusted_record_id_set):
+        return record.dependencies_hash_digest
+    # Normal recursive case starts here:
+    if isinstance(record, DraftChangeLogRecord):
+        branch = "draft"
+    elif isinstance(record, PublishLogRecord):
+        branch = "published"
+    else:
+        raise TypeError(
+            f"expected DraftChangeLogRecord or PublishLogRecord, not {type(record)}"
+        )
+    # This is extra work that only happens in case of a backfill, where we might
+    # need to compute dependency hashes for things outside of our log (because
+    # we don't trust them).
+    if record.id not in record_ids_to_live_deps:
+        if record.new_version is None:
+            record_ids_to_hash_digests[record.id] = ''
+            return ''
+        deps = list(
+            entity for entity in record.new_version.dependencies.all()
+            if hasattr(entity, branch) and getattr(entity, branch).version
+        )
+        # If there are no live dependencies, this log record also gets the
+        # default/blank value.
+        if not deps:
+            record_ids_to_hash_digests[record.id] = ''
+            return ''
+        record_ids_to_live_deps[record.id] = deps
+    # End special handling for backfill.
+    # Begin normal
+    dependencies: list[PublishableEntity] = sorted(
+        record_ids_to_live_deps[record.id],
+        key=lambda entity: getattr(entity, branch).log_record.new_version_id,
+    )
+    dep_state_entries = []
+    for dep_entity in dependencies:
+        new_version_id = getattr(dep_entity, branch).log_record.new_version_id
+        hash_digest = hash_for_log_record(
+            getattr(dep_entity, branch).log_record,
+            record_ids_to_hash_digests,
+            record_ids_to_live_deps,
+            untrusted_record_id_set,
+        )
+        dep_state_entries.append(f"{new_version_id}:{hash_digest}")
+    summary_text = "\n".join(dep_state_entries)
+    digest = create_hash_digest(summary_text.encode(), num_bytes=4)
+    record_ids_to_hash_digests[record.id] = digest
+    return digest
 def soft_delete_draft(publishable_entity_id: int, /, deleted_by: int | None = None) -> None:
     """
@@ -974,7 +1443,6 @@ def create_entity_list_with_rows(
                 raise ValidationError("Container entity versions must belong to the specified entity.")
     with atomic(savepoint=False):
         entity_list = create_entity_list()
         EntityListRow.objects.bulk_create(
             [
@@ -1012,6 +1480,11 @@ def _create_container_version(
             title=title,
             created=created,
             created_by=created_by,
+            dependencies=[
+                entity_row.entity_id
+                for entity_row in entity_list.rows
+                if entity_row.is_unpinned()
+            ]
         )
         container_version = container_version_cls.objects.create(
             publishable_entity_version=publishable_entity_version,
@@ -1378,6 +1851,11 @@ def contains_unpublished_changes(container_id: int) -> bool:
     [ 🛑 UNSTABLE ]
     Check recursively if a container has any unpublished changes.
+    Note: I've preserved the API signature for now, but we probably eventually
+    want to make a more general function that operates on PublishableEntities
+    and dependencies, once we introduce those with courses and their files,
+    grading policies, etc.
     Note: unlike this method, the similar-sounding
     `container.versioning.has_unpublished_changes` property only reports
     if the container itself has unpublished changes, not
@@ -1386,50 +1864,37 @@ def contains_unpublished_changes(container_id: int) -> bool:
     that's in the container, it will be `False`. This method will return `True`
     in either case.
     """
-    # This is similar to 'get_container(container.container_id)' but pre-loads more data.
-    container = Container.objects.select_related(
-        "publishable_entity__draft__version__containerversion__entity_list",
-    ).get(pk=container_id)
+    container = (
+        Container.objects
+                 .select_related('publishable_entity__draft__draft_log_record')
+                 .select_related('publishable_entity__published__publish_log_record')
+                 .get(pk=container_id)
+    )
     if container.versioning.has_unpublished_changes:
         return True
-    # We only care about children that are un-pinned, since published changes to pinned children don't matter
-    entity_list = getattr(container.versioning.draft, "entity_list", None)
-    if entity_list is None:
-        # This container has been soft-deleted, so it has no children.
+    draft = container.publishable_entity.draft
+    published = container.publishable_entity.published
+    # Edge case: A container that was created and then immediately soft-deleted
+    # does not contain any unpublished changes.
+    if draft is None and published is None:
         return False
-    # This is a naive and inefficient implementation but should be correct.
-    # TODO: Once we have expanded the containers system to support multiple levels (not just Units and Components but
-    # also subsections and sections) and we have an expanded test suite for correctness, then we can optimize.
-    # We will likely change to a tracking-based approach rather than a "scan for changes" based approach.
-    for row in entity_list.entitylistrow_set.filter(entity_version=None).select_related(
-        "entity__container",
-        "entity__draft__version",
-        "entity__published__version",
-    ):
-        try:
-            child_container = row.entity.container
-        except Container.DoesNotExist:
-            child_container = None
-        if child_container:
-            # This is itself a container - check recursively:
-            if contains_unpublished_changes(child_container.pk):
-                return True
-        else:
-            # This is not a container:
-            draft_pk = row.entity.draft.version_id if row.entity.draft else None
-            published_pk = row.entity.published.version_id if hasattr(row.entity, "published") else None
-            if draft_pk != published_pk:
-                return True
-    return False
+    # The dependencies_hash_digest captures the state of all descendants, so we
+    # can do this quick comparison instead of iterating through layers of
+    # containers.
+    draft_version_hash_digest = draft.log_record.dependencies_hash_digest
+    published_version_hash_digest = published.log_record.dependencies_hash_digest
+    return draft_version_hash_digest != published_version_hash_digest
 def get_containers_with_entity(
     publishable_entity_pk: int,
     *,
     ignore_pinned=False,
+    published=False,
 ) -> QuerySet[Container]:
     """
     [ 🛑 UNSTABLE ]
@@ -1442,20 +1907,31 @@ def get_containers_with_entity(
         publishable_entity_pk: The ID of the PublishableEntity to search for.
         ignore_pinned: if true, ignore any pinned references to the entity.
     """
+    branch = "published" if published else "draft"
     if ignore_pinned:
-        qs = Container.objects.filter(
-            # Note: these two conditions must be in the same filter() call, or the query won't be correct.
-            publishable_entity__draft__version__containerversion__entity_list__entitylistrow__entity_id=publishable_entity_pk,  # pylint: disable=line-too-long # noqa: E501
-            publishable_entity__draft__version__containerversion__entity_list__entitylistrow__entity_version_id=None,  # pylint: disable=line-too-long # noqa: E501
-        )
+        filter_dict = {
+            # Note: these two conditions must be in the same filter() call,
+            # or the query won't be correct.
+            (
+                f"publishable_entity__{branch}__version__"
+                "containerversion__entity_list__entitylistrow__entity_id"
+            ): publishable_entity_pk,
+            (
+                f"publishable_entity__{branch}__version__"
+                "containerversion__entity_list__entitylistrow__entity_version_id"
+            ): None,
+        }
+        qs = Container.objects.filter(**filter_dict)
     else:
-        qs = Container.objects.filter(
-            publishable_entity__draft__version__containerversion__entity_list__entitylistrow__entity_id=publishable_entity_pk,  # pylint: disable=line-too-long # noqa: E501
-        )
-    return qs.select_related(
-        "publishable_entity__draft__version__containerversion",
-        "publishable_entity__published__version__containerversion",
-    ).order_by("pk").distinct()  # Ordering is mostly for consistent test cases.
+        filter_dict = {
+            (
+                f"publishable_entity__{branch}__version__"
+                "containerversion__entity_list__entitylistrow__entity_id"
+            ): publishable_entity_pk
+        }
+        qs = Container.objects.filter(**filter_dict)
+    return qs.order_by("pk").distinct()  # Ordering is mostly for consistent test cases.
 def get_container_children_count(
@@ -1539,6 +2015,6 @@ def bulk_draft_changes_for(
         changed_at=changed_at,
         changed_by=changed_by,
         exit_callbacks=[
-            _create_container_side_effects_for_draft_change_log,
+            _create_side_effects_for_change_log,
         ]
     )

openedx-learning 0.29.0__py2.py3-none-any.whl → 0.30.0__py2.py3-none-any.whl

openedx-learning 0.29.0py2.py3-none-any.whl → 0.30.0py2.py3-none-any.whl