arkindex-base-worker 0.4.0__py3-none-any.whl → 0.4.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. {arkindex_base_worker-0.4.0.dist-info → arkindex_base_worker-0.4.0a2.dist-info}/METADATA +13 -15
  2. arkindex_base_worker-0.4.0a2.dist-info/RECORD +51 -0
  3. {arkindex_base_worker-0.4.0.dist-info → arkindex_base_worker-0.4.0a2.dist-info}/WHEEL +1 -1
  4. arkindex_worker/cache.py +1 -1
  5. arkindex_worker/image.py +1 -120
  6. arkindex_worker/utils.py +0 -82
  7. arkindex_worker/worker/__init__.py +161 -46
  8. arkindex_worker/worker/base.py +11 -36
  9. arkindex_worker/worker/classification.py +18 -34
  10. arkindex_worker/worker/corpus.py +4 -21
  11. arkindex_worker/worker/dataset.py +1 -71
  12. arkindex_worker/worker/element.py +91 -352
  13. arkindex_worker/worker/entity.py +11 -11
  14. arkindex_worker/worker/metadata.py +9 -19
  15. arkindex_worker/worker/task.py +4 -5
  16. arkindex_worker/worker/training.py +6 -6
  17. arkindex_worker/worker/transcription.py +68 -89
  18. arkindex_worker/worker/version.py +1 -3
  19. tests/__init__.py +1 -1
  20. tests/conftest.py +45 -33
  21. tests/test_base_worker.py +3 -204
  22. tests/test_dataset_worker.py +4 -7
  23. tests/test_elements_worker/{test_classification.py → test_classifications.py} +61 -194
  24. tests/test_elements_worker/test_corpus.py +1 -32
  25. tests/test_elements_worker/test_dataset.py +1 -1
  26. tests/test_elements_worker/test_elements.py +2734 -0
  27. tests/test_elements_worker/{test_entity_create.py → test_entities.py} +160 -26
  28. tests/test_elements_worker/test_image.py +1 -2
  29. tests/test_elements_worker/test_metadata.py +99 -224
  30. tests/test_elements_worker/test_task.py +1 -1
  31. tests/test_elements_worker/test_training.py +2 -2
  32. tests/test_elements_worker/test_transcriptions.py +2102 -0
  33. tests/test_elements_worker/test_worker.py +280 -563
  34. tests/test_image.py +204 -429
  35. tests/test_merge.py +2 -1
  36. tests/test_utils.py +3 -66
  37. arkindex_base_worker-0.4.0.dist-info/RECORD +0 -61
  38. arkindex_worker/worker/process.py +0 -92
  39. tests/test_elements_worker/test_element.py +0 -427
  40. tests/test_elements_worker/test_element_create_multiple.py +0 -715
  41. tests/test_elements_worker/test_element_create_single.py +0 -528
  42. tests/test_elements_worker/test_element_list_children.py +0 -969
  43. tests/test_elements_worker/test_element_list_parents.py +0 -530
  44. tests/test_elements_worker/test_entity_list_and_check.py +0 -160
  45. tests/test_elements_worker/test_process.py +0 -89
  46. tests/test_elements_worker/test_transcription_create.py +0 -873
  47. tests/test_elements_worker/test_transcription_create_with_elements.py +0 -951
  48. tests/test_elements_worker/test_transcription_list.py +0 -450
  49. tests/test_elements_worker/test_version.py +0 -60
  50. {arkindex_base_worker-0.4.0.dist-info → arkindex_base_worker-0.4.0a2.dist-info}/LICENSE +0 -0
  51. {arkindex_base_worker-0.4.0.dist-info → arkindex_base_worker-0.4.0a2.dist-info}/top_level.txt +0 -0
@@ -3,11 +3,10 @@ import re
3
3
  from uuid import UUID
4
4
 
5
5
  import pytest
6
+ from apistar.exceptions import ErrorResponse
6
7
 
7
- from arkindex.exceptions import ErrorResponse
8
8
  from arkindex_worker.cache import CachedClassification, CachedElement
9
9
  from arkindex_worker.models import Element
10
- from arkindex_worker.utils import DEFAULT_BATCH_SIZE
11
10
  from tests import CORPUS_ID
12
11
 
13
12
  from . import BASE_API_CALLS
@@ -17,92 +16,6 @@ from . import BASE_API_CALLS
17
16
  DELETE_PARAMETER = "DELETE_PARAMETER"
18
17
 
19
18
 
20
- def test_load_corpus_classes_api_error(responses, mock_elements_worker):
21
- responses.add(
22
- responses.GET,
23
- f"http://testserver/api/v1/corpus/{CORPUS_ID}/classes/",
24
- status=418,
25
- )
26
-
27
- assert not mock_elements_worker.classes
28
- with pytest.raises(
29
- Exception, match="Stopping pagination as data will be incomplete"
30
- ):
31
- mock_elements_worker.load_corpus_classes()
32
-
33
- assert len(responses.calls) == len(BASE_API_CALLS) + 5
34
- assert [
35
- (call.request.method, call.request.url) for call in responses.calls
36
- ] == BASE_API_CALLS + [
37
- # We do 5 retries
38
- (
39
- "GET",
40
- f"http://testserver/api/v1/corpus/{CORPUS_ID}/classes/",
41
- ),
42
- (
43
- "GET",
44
- f"http://testserver/api/v1/corpus/{CORPUS_ID}/classes/",
45
- ),
46
- (
47
- "GET",
48
- f"http://testserver/api/v1/corpus/{CORPUS_ID}/classes/",
49
- ),
50
- (
51
- "GET",
52
- f"http://testserver/api/v1/corpus/{CORPUS_ID}/classes/",
53
- ),
54
- (
55
- "GET",
56
- f"http://testserver/api/v1/corpus/{CORPUS_ID}/classes/",
57
- ),
58
- ]
59
- assert not mock_elements_worker.classes
60
-
61
-
62
- def test_load_corpus_classes(responses, mock_elements_worker):
63
- responses.add(
64
- responses.GET,
65
- f"http://testserver/api/v1/corpus/{CORPUS_ID}/classes/",
66
- status=200,
67
- json={
68
- "count": 3,
69
- "next": None,
70
- "results": [
71
- {
72
- "id": "0000",
73
- "name": "good",
74
- },
75
- {
76
- "id": "1111",
77
- "name": "average",
78
- },
79
- {
80
- "id": "2222",
81
- "name": "bad",
82
- },
83
- ],
84
- },
85
- )
86
-
87
- assert not mock_elements_worker.classes
88
- mock_elements_worker.load_corpus_classes()
89
-
90
- assert len(responses.calls) == len(BASE_API_CALLS) + 1
91
- assert [
92
- (call.request.method, call.request.url) for call in responses.calls
93
- ] == BASE_API_CALLS + [
94
- (
95
- "GET",
96
- f"http://testserver/api/v1/corpus/{CORPUS_ID}/classes/",
97
- ),
98
- ]
99
- assert mock_elements_worker.classes == {
100
- "good": "0000",
101
- "average": "1111",
102
- "bad": "2222",
103
- }
104
-
105
-
106
19
  def test_get_ml_class_id_load_classes(responses, mock_elements_worker):
107
20
  responses.add(
108
21
  responses.GET,
@@ -779,8 +692,7 @@ def test_create_classifications_create_ml_class(mock_elements_worker, responses)
779
692
  }
780
693
 
781
694
 
782
- @pytest.mark.parametrize("batch_size", [DEFAULT_BATCH_SIZE, 1])
783
- def test_create_classifications(batch_size, responses, mock_elements_worker):
695
+ def test_create_classifications(responses, mock_elements_worker):
784
696
  mock_elements_worker.classes = {"portrait": "0000", "landscape": "1111"}
785
697
  elt = Element({"id": "12341234-1234-1234-1234-123412341234"})
786
698
  responses.add(
@@ -804,98 +716,62 @@ def test_create_classifications(batch_size, responses, mock_elements_worker):
804
716
  "high_confidence": False,
805
717
  },
806
718
  ],
807
- batch_size=batch_size,
808
719
  )
809
720
 
810
- bulk_api_calls = [("POST", "http://testserver/api/v1/classification/bulk/")]
811
- if batch_size != DEFAULT_BATCH_SIZE:
812
- bulk_api_calls.append(("POST", "http://testserver/api/v1/classification/bulk/"))
813
-
814
- assert len(responses.calls) == len(BASE_API_CALLS) + len(bulk_api_calls)
721
+ assert len(responses.calls) == len(BASE_API_CALLS) + 1
815
722
  assert [
816
723
  (call.request.method, call.request.url) for call in responses.calls
817
- ] == BASE_API_CALLS + bulk_api_calls
724
+ ] == BASE_API_CALLS + [
725
+ ("POST", "http://testserver/api/v1/classification/bulk/"),
726
+ ]
818
727
 
819
- first_cl = {"confidence": 0.75, "high_confidence": False, "ml_class": "0000"}
820
- second_cl = {"confidence": 0.25, "high_confidence": False, "ml_class": "1111"}
821
- empty_payload = {
728
+ assert json.loads(responses.calls[-1].request.body) == {
822
729
  "parent": str(elt.id),
823
730
  "worker_run_id": "56785678-5678-5678-5678-567856785678",
824
- "classifications": [],
731
+ "classifications": [
732
+ {
733
+ "confidence": 0.75,
734
+ "high_confidence": False,
735
+ "ml_class": "0000",
736
+ },
737
+ {
738
+ "confidence": 0.25,
739
+ "high_confidence": False,
740
+ "ml_class": "1111",
741
+ },
742
+ ],
825
743
  }
826
744
 
827
- bodies = []
828
- first_call_idx = None
829
- if batch_size > 1:
830
- first_call_idx = -1
831
- bodies.append({**empty_payload, "classifications": [first_cl, second_cl]})
832
- else:
833
- first_call_idx = -2
834
- bodies.append({**empty_payload, "classifications": [first_cl]})
835
- bodies.append({**empty_payload, "classifications": [second_cl]})
836
-
837
- assert [
838
- json.loads(bulk_call.request.body)
839
- for bulk_call in responses.calls[first_call_idx:]
840
- ] == bodies
841
-
842
745
 
843
- @pytest.mark.parametrize("batch_size", [DEFAULT_BATCH_SIZE, 1])
844
- def test_create_classifications_with_cache(
845
- batch_size, responses, mock_elements_worker_with_cache
846
- ):
746
+ def test_create_classifications_with_cache(responses, mock_elements_worker_with_cache):
847
747
  mock_elements_worker_with_cache.classes = {"portrait": "0000", "landscape": "1111"}
848
748
  elt = CachedElement.create(id="12341234-1234-1234-1234-123412341234", type="thing")
849
749
 
850
- if batch_size > 1:
851
- responses.add(
852
- responses.POST,
853
- "http://testserver/api/v1/classification/bulk/",
854
- status=200,
855
- json={
856
- "parent": str(elt.id),
857
- "worker_run_id": "56785678-5678-5678-5678-567856785678",
858
- "classifications": [
859
- {
860
- "id": "00000000-0000-0000-0000-000000000000",
861
- "ml_class": "0000",
862
- "confidence": 0.75,
863
- "high_confidence": False,
864
- "state": "pending",
865
- },
866
- {
867
- "id": "11111111-1111-1111-1111-111111111111",
868
- "ml_class": "1111",
869
- "confidence": 0.25,
870
- "high_confidence": False,
871
- "state": "pending",
872
- },
873
- ],
874
- },
875
- )
876
- else:
877
- for cl_id, cl_class, cl_conf in [
878
- ("00000000-0000-0000-0000-000000000000", "0000", 0.75),
879
- ("11111111-1111-1111-1111-111111111111", "1111", 0.25),
880
- ]:
881
- responses.add(
882
- responses.POST,
883
- "http://testserver/api/v1/classification/bulk/",
884
- status=200,
885
- json={
886
- "parent": str(elt.id),
887
- "worker_run_id": "56785678-5678-5678-5678-567856785678",
888
- "classifications": [
889
- {
890
- "id": cl_id,
891
- "ml_class": cl_class,
892
- "confidence": cl_conf,
893
- "high_confidence": False,
894
- "state": "pending",
895
- },
896
- ],
750
+ responses.add(
751
+ responses.POST,
752
+ "http://testserver/api/v1/classification/bulk/",
753
+ status=200,
754
+ json={
755
+ "parent": str(elt.id),
756
+ "worker_run_id": "56785678-5678-5678-5678-567856785678",
757
+ "classifications": [
758
+ {
759
+ "id": "00000000-0000-0000-0000-000000000000",
760
+ "ml_class": "0000",
761
+ "confidence": 0.75,
762
+ "high_confidence": False,
763
+ "state": "pending",
764
+ },
765
+ {
766
+ "id": "11111111-1111-1111-1111-111111111111",
767
+ "ml_class": "1111",
768
+ "confidence": 0.25,
769
+ "high_confidence": False,
770
+ "state": "pending",
897
771
  },
898
- )
772
+ ],
773
+ },
774
+ )
899
775
 
900
776
  mock_elements_worker_with_cache.create_classifications(
901
777
  element=elt,
@@ -911,41 +787,32 @@ def test_create_classifications_with_cache(
911
787
  "high_confidence": False,
912
788
  },
913
789
  ],
914
- batch_size=batch_size,
915
790
  )
916
791
 
917
- bulk_api_calls = [("POST", "http://testserver/api/v1/classification/bulk/")]
918
- if batch_size != DEFAULT_BATCH_SIZE:
919
- bulk_api_calls.append(("POST", "http://testserver/api/v1/classification/bulk/"))
920
-
921
- assert len(responses.calls) == len(BASE_API_CALLS) + len(bulk_api_calls)
792
+ assert len(responses.calls) == len(BASE_API_CALLS) + 1
922
793
  assert [
923
794
  (call.request.method, call.request.url) for call in responses.calls
924
- ] == BASE_API_CALLS + bulk_api_calls
795
+ ] == BASE_API_CALLS + [
796
+ ("POST", "http://testserver/api/v1/classification/bulk/"),
797
+ ]
925
798
 
926
- first_cl = {"confidence": 0.75, "high_confidence": False, "ml_class": "0000"}
927
- second_cl = {"confidence": 0.25, "high_confidence": False, "ml_class": "1111"}
928
- empty_payload = {
799
+ assert json.loads(responses.calls[-1].request.body) == {
929
800
  "parent": str(elt.id),
930
801
  "worker_run_id": "56785678-5678-5678-5678-567856785678",
931
- "classifications": [],
802
+ "classifications": [
803
+ {
804
+ "confidence": 0.75,
805
+ "high_confidence": False,
806
+ "ml_class": "0000",
807
+ },
808
+ {
809
+ "confidence": 0.25,
810
+ "high_confidence": False,
811
+ "ml_class": "1111",
812
+ },
813
+ ],
932
814
  }
933
815
 
934
- bodies = []
935
- first_call_idx = None
936
- if batch_size > 1:
937
- first_call_idx = -1
938
- bodies.append({**empty_payload, "classifications": [first_cl, second_cl]})
939
- else:
940
- first_call_idx = -2
941
- bodies.append({**empty_payload, "classifications": [first_cl]})
942
- bodies.append({**empty_payload, "classifications": [second_cl]})
943
-
944
- assert [
945
- json.loads(bulk_call.request.body)
946
- for bulk_call in responses.calls[first_call_idx:]
947
- ] == bodies
948
-
949
816
  # Check that created classifications were properly stored in SQLite cache
950
817
  assert list(CachedClassification.select()) == [
951
818
  CachedClassification(
@@ -2,44 +2,13 @@ import re
2
2
  import uuid
3
3
 
4
4
  import pytest
5
+ from apistar.exceptions import ErrorResponse
5
6
 
6
- from arkindex.exceptions import ErrorResponse
7
7
  from arkindex_worker.worker.corpus import CorpusExportState
8
8
  from tests import CORPUS_ID
9
9
  from tests.test_elements_worker import BASE_API_CALLS
10
10
 
11
11
 
12
- def test_download_export_not_a_uuid(responses, mock_elements_worker):
13
- with pytest.raises(ValueError, match="export_id is not a valid uuid."):
14
- mock_elements_worker.download_export("mon export")
15
-
16
-
17
- def test_download_export(responses, mock_elements_worker):
18
- responses.add(
19
- responses.GET,
20
- "http://testserver/api/v1/export/aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff/",
21
- status=302,
22
- body=b"some SQLite export",
23
- content_type="application/x-sqlite3",
24
- stream=True,
25
- )
26
-
27
- export = mock_elements_worker.download_export(
28
- "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff"
29
- )
30
- assert export.name == "/tmp/aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff"
31
-
32
- assert len(responses.calls) == len(BASE_API_CALLS) + 1
33
- assert [
34
- (call.request.method, call.request.url) for call in responses.calls
35
- ] == BASE_API_CALLS + [
36
- (
37
- "GET",
38
- "http://testserver/api/v1/export/aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeffff/",
39
- ),
40
- ]
41
-
42
-
43
12
  def mock_list_exports_call(responses, export_id):
44
13
  responses.add(
45
14
  responses.GET,
@@ -2,8 +2,8 @@ import json
2
2
  import logging
3
3
 
4
4
  import pytest
5
+ from apistar.exceptions import ErrorResponse
5
6
 
6
- from arkindex.exceptions import ErrorResponse
7
7
  from arkindex_worker.models import Dataset, Element, Set
8
8
  from arkindex_worker.worker.dataset import DatasetState
9
9
  from tests import PROCESS_ID