airbyte-source-shopify 2.4.14.dev202407181247__py3-none-any.whl → 3.1.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
Files changed (29)
  1. {airbyte_source_shopify-2.4.14.dev202407181247.dist-info → airbyte_source_shopify-3.1.0.dist-info}/METADATA +4 -4
  2. {airbyte_source_shopify-2.4.14.dev202407181247.dist-info → airbyte_source_shopify-3.1.0.dist-info}/RECORD +25 -27
  3. {airbyte_source_shopify-2.4.14.dev202407181247.dist-info → airbyte_source_shopify-3.1.0.dist-info}/WHEEL +1 -1
  4. source_shopify/auth.py +0 -1
  5. source_shopify/config_migrations.py +4 -1
  6. source_shopify/http_request.py +4 -2
  7. source_shopify/schemas/countries.json +7 -19
  8. source_shopify/schemas/customer_journey_summary.json +228 -148
  9. source_shopify/schemas/deleted_products.json +27 -0
  10. source_shopify/schemas/orders.json +38 -0
  11. source_shopify/schemas/product_variants.json +26 -8
  12. source_shopify/schemas/profile_location_groups.json +10 -0
  13. source_shopify/scopes.py +7 -6
  14. source_shopify/shopify_graphql/bulk/exceptions.py +6 -1
  15. source_shopify/shopify_graphql/bulk/job.py +173 -65
  16. source_shopify/shopify_graphql/bulk/query.py +440 -88
  17. source_shopify/shopify_graphql/bulk/record.py +260 -29
  18. source_shopify/shopify_graphql/bulk/retry.py +12 -12
  19. source_shopify/shopify_graphql/bulk/tools.py +17 -2
  20. source_shopify/source.py +6 -10
  21. source_shopify/spec.json +11 -5
  22. source_shopify/streams/base_streams.py +181 -54
  23. source_shopify/streams/streams.py +211 -58
  24. source_shopify/utils.py +47 -12
  25. source_shopify/schemas/customer_saved_search.json +0 -32
  26. source_shopify/schemas/products_graph_ql.json +0 -123
  27. source_shopify/shopify_graphql/graphql.py +0 -64
  28. source_shopify/shopify_graphql/schema.py +0 -29442
  29. {airbyte_source_shopify-2.4.14.dev202407181247.dist-info → airbyte_source_shopify-3.1.0.dist-info}/entry_points.txt +0 -0
@@ -12,28 +12,28 @@ from urllib.parse import parse_qsl, urlparse
 
 import pendulum as pdm
 import requests
-from airbyte_cdk.sources.streams.core import StreamData
-from airbyte_cdk.sources.streams.http import HttpClient, HttpStream
-from airbyte_cdk.sources.streams.http.error_handlers import ErrorHandler, HttpStatusErrorHandler
-from airbyte_cdk.sources.streams.http.error_handlers.default_error_mapping import DEFAULT_ERROR_MAPPING
-from airbyte_protocol.models import SyncMode
 from requests.exceptions import RequestException
 from source_shopify.http_request import ShopifyErrorHandler
 from source_shopify.shopify_graphql.bulk.job import ShopifyBulkManager
-from source_shopify.shopify_graphql.bulk.query import ShopifyBulkQuery
-from source_shopify.shopify_graphql.bulk.record import ShopifyBulkRecord
+from source_shopify.shopify_graphql.bulk.query import DeliveryZoneList, ShopifyBulkQuery
 from source_shopify.transform import DataTypeEnforcer
+from source_shopify.utils import ApiTypeEnum, ShopifyNonRetryableErrors
 from source_shopify.utils import EagerlyCachedStreamState as stream_state_cache
-from source_shopify.utils import ShopifyNonRetryableErrors
 from source_shopify.utils import ShopifyRateLimiter as limiter
 
+from airbyte_cdk.models import SyncMode
+from airbyte_cdk.sources.streams.core import StreamData
+from airbyte_cdk.sources.streams.http import HttpClient, HttpStream
+from airbyte_cdk.sources.streams.http.error_handlers import ErrorHandler, HttpStatusErrorHandler
+from airbyte_cdk.sources.streams.http.error_handlers.default_error_mapping import DEFAULT_ERROR_MAPPING
+
 
 class ShopifyStream(HttpStream, ABC):
     # define default logger
     logger = logging.getLogger("airbyte")
 
     # Latest Stable Release
-    api_version = "2024-04"
+    api_version = "2025-01"
     # Page size
     limit = 250
 
@@ -89,7 +89,7 @@ class ShopifyStream(HttpStream, ABC):
                 records = json_response.get(self.data_field, []) if self.data_field is not None else json_response
                 yield from self.produce_records(records)
             except RequestException as e:
-                self.logger.warning(f"Unexpected error in `parse_ersponse`: {e}, the actual response data: {response.text}")
+                self.logger.warning(f"Unexpected error in `parse_response`: {e}, the actual response data: {response.text}")
                 yield {}
 
     def produce_records(
@@ -144,9 +144,8 @@ class ShopifyDeletedEventsStream(ShopifyStream):
             yield {
                 "id": event["subject_id"],
                 self.cursor_field: event["created_at"],
-                "updated_at": event["created_at"],
-                "deleted_message": event["message"],
-                "deleted_description": event["description"],
+                "deleted_message": event.get("message", None),
+                "deleted_description": event.get("description", None),
                 "shop_url": event["shop_url"],
             }
 
@@ -178,9 +177,19 @@ class ShopifyDeletedEventsStream(ShopifyStream):
 class IncrementalShopifyStream(ShopifyStream, ABC):
     # Setting the check point interval to the limit of the records output
     state_checkpoint_interval = 250
-    # guarantee for the NestedSubstreams to emit the STATE
-    # when we have the abnormal STATE distance between Parent and Substream
-    filter_by_state_checkpoint = False
+
+    def __init__(self, config: Dict):
+        super().__init__(config)
+        # _filter_checkpointed_cursor used to checkpoint streams with cursor field - ID in job.get_adjusted_job_end
+        self._filter_checkpointed_cursor = None
+
+    @property
+    def filter_by_state_checkpoint(self) -> bool:
+        """
+        This filtering flag stands to guarantee for the NestedSubstreams to emit the STATE correctly,
+        when we have the abnormal STATE distance between Parent and Substream
+        """
+        return False
 
     # Setting the default cursor field for all streams
     cursor_field = "updated_at"
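Note: the hunk above turns the class-level `filter_by_state_checkpoint = False` flag into a property, so subclasses can decide at runtime whether checkpoint filtering applies. A minimal sketch of that override pattern follows; the class names and the constructor flag are illustrative, not the connector's actual classes:

    # Illustrative sketch of the property override introduced above.
    class BaseStream:
        @property
        def filter_by_state_checkpoint(self) -> bool:
            # plain incremental streams keep checkpoint filtering off
            return False

    class BulkStream(BaseStream):
        def __init__(self, supports_checkpointing: bool) -> None:
            # modeled here as a constructor flag; the real stream reads it from its job manager
            self._supports_checkpointing = supports_checkpointing

        @property
        def filter_by_state_checkpoint(self) -> bool:
            return self._supports_checkpointing

    assert BulkStream(True).filter_by_state_checkpoint is True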
@@ -212,7 +221,12 @@ class IncrementalShopifyStream(ShopifyStream, ABC):
             params[self.filter_field] = stream_state.get(self.cursor_field)
         return params
 
-    def track_checkpoint_cursor(self, record_value: Union[str, int]) -> None:
+    def track_checkpoint_cursor(self, record_value: Union[str, int], filter_record_value: Optional[str] = None) -> None:
+        """
+        Tracks _checkpoint_cursor value (values from cursor field) and _filter_checkpointed_cursor value (value from filter field).
+        _filter_checkpointed_cursor value is only used when cursor field is ID for streams like Customer Address etc.
+        When after canceled/failed job source tries to adjust stream slice (see ShopifyBulkManager._adjust_slice_end()).
+        """
         if self.filter_by_state_checkpoint:
             # set checkpoint cursor
             if not self._checkpoint_cursor:
@@ -221,10 +235,14 @@ class IncrementalShopifyStream(ShopifyStream, ABC):
             if str(record_value) >= str(self._checkpoint_cursor):
                 self._checkpoint_cursor = record_value
 
+            if filter_record_value:
+                if not self._filter_checkpointed_cursor or str(filter_record_value) >= str(self._filter_checkpointed_cursor):
+                    self._filter_checkpointed_cursor = filter_record_value
+
     def should_checkpoint(self, index: int) -> bool:
         return self.filter_by_state_checkpoint and index >= self.state_checkpoint_interval
 
-    # Parse the `stream_slice` with respect to `stream_state` for `Incremental refresh`
+    # Parse the `records` with respect to the `stream_state` for the `Incremental refresh`
     # cases where we slice the stream, the endpoints for those classes don't accept any other filtering,
     # but they provide us with the updated_at field in most cases, so we used that as incremental filtering during the order slicing.
     def filter_records_newer_than_state(
@@ -238,7 +256,8 @@ class IncrementalShopifyStream(ShopifyStream, ABC):
         for index, record in enumerate(records_slice, 1):
             if self.cursor_field in record:
                 record_value = record.get(self.cursor_field, self.default_state_comparison_value)
-                self.track_checkpoint_cursor(record_value)
+                filter_record_value = record.get(self.filter_field) if self.filter_field else None
+                self.track_checkpoint_cursor(record_value, filter_record_value)
                 if record_value:
                     if record_value >= state_value:
                         yield record
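The two hunks above wire the cursor tracking into record filtering: for every record the stream keeps the largest cursor value seen, and now also the largest filter-field value, both compared as strings. A standalone sketch of that comparison logic, simplified and outside the stream class:

    # Simplified sketch of the checkpoint tracking shown above (not the connector's actual class).
    checkpoint_cursor = None           # largest value seen for the cursor field (may be an ID)
    filter_checkpointed_cursor = None  # largest value seen for the filter field (e.g. updated_at)

    def track(record_value, filter_record_value=None):
        global checkpoint_cursor, filter_checkpointed_cursor
        if checkpoint_cursor is None or str(record_value) >= str(checkpoint_cursor):
            checkpoint_cursor = record_value
        if filter_record_value and (
            filter_checkpointed_cursor is None or str(filter_record_value) >= str(filter_checkpointed_cursor)
        ):
            filter_checkpointed_cursor = filter_record_value

    for record in [{"id": 10, "updated_at": "2024-05-01"}, {"id": 12, "updated_at": "2024-05-03"}]:
        track(record["id"], record.get("updated_at"))
    # checkpoint_cursor == 12, filter_checkpointed_cursor == "2024-05-03"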
@@ -567,6 +586,10 @@ class IncrementalShopifyNestedStream(IncrementalShopifyStream):
 
 
 class IncrementalShopifyStreamWithDeletedEvents(IncrementalShopifyStream):
+    def __init__(self, config: Dict) -> None:
+        self._stream_state: MutableMapping[str, Any] = {}
+        super().__init__(config)
+
     @property
     @abstractmethod
     def deleted_events_api_name(self) -> str:
@@ -603,13 +626,13 @@ class IncrementalShopifyStreamWithDeletedEvents(IncrementalShopifyStream):
         """
         We extend the stream state with `deleted` property to store the `destroyed` records STATE separetely from the Stream State.
         """
-        state = super().get_updated_state(current_stream_state, latest_record)
+        self._stream_state = super().get_updated_state(self._stream_state, latest_record)
         # add `deleted` property to each stream supports `deleted events`,
         # to provide the `Incremental` sync mode, for the `Incremental Delete` records.
         last_deleted_record_value = latest_record.get(self.deleted_cursor_field) or self.default_deleted_state_comparison_value
         current_deleted_state_value = current_stream_state.get(self.deleted_cursor_field) or self.default_deleted_state_comparison_value
-        state["deleted"] = {self.deleted_cursor_field: max(last_deleted_record_value, current_deleted_state_value)}
-        return state
+        self._stream_state["deleted"] = {self.deleted_cursor_field: max(last_deleted_record_value, current_deleted_state_value)}
+        return self._stream_state
 
     def read_records(
         self,
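The net effect of the `get_updated_state` change above is that the stream keeps one internal state object and extends it with a `deleted` key, so destroyed records carry their own cursor. A hedged sketch of the resulting state shape; the timestamps are examples and `deleted_at` is assumed to be the deleted cursor field:

    # Example of the state shape produced by the hunk above (values are illustrative).
    stream_state = {
        "updated_at": "2024-06-01T00:00:00+00:00",              # regular incremental cursor
        "deleted": {"deleted_at": "2024-05-20T00:00:00+00:00"},  # cursor for destroyed records
    }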
@@ -630,38 +653,60 @@ class IncrementalShopifyGraphQlBulkStream(IncrementalShopifyStream):
     data_field = "graphql"
 
     parent_stream_class: Optional[Union[ShopifyStream, IncrementalShopifyStream]] = None
-    filter_by_state_checkpoint = True
 
     def __init__(self, config: Dict) -> None:
         super().__init__(config)
-        # init BULK Query instance, pass `shop_id` from config
-        self.query = self.bulk_query(shop_id=config.get("shop_id"))
         # define BULK Manager instance
         self.job_manager: ShopifyBulkManager = ShopifyBulkManager(
             http_client=self.bulk_http_client,
             base_url=f"{self.url_base}{self.path()}",
-            query=self.query,
+            query=self.bulk_query(config, self.parent_stream_query_cursor_alias),
             job_termination_threshold=float(config.get("job_termination_threshold", 3600)),
             # overide the default job slice size, if provided (it's auto-adjusted, later on)
             job_size=config.get("bulk_window_in_days", 30.0),
             # provide the job checkpoint interval value, default value is 200k lines collected
-            job_checkpoint_interval=config.get("job_checkpoint_interval", 200000),
+            job_checkpoint_interval=config.get("job_checkpoint_interval", 200_000),
+            parent_stream_name=self.parent_stream_name,
+            parent_stream_cursor=self.parent_stream_cursor,
         )
 
-        # define Record Producer instance
-        self.record_producer: ShopifyBulkRecord = ShopifyBulkRecord(self.query)
+    @property
+    def filter_by_state_checkpoint(self) -> bool:
+        return self.job_manager._supports_checkpointing
 
     @property
     def bulk_http_client(self) -> HttpClient:
+        """
+        Returns the instance of the `HttpClient`, with the stream info.
+        """
         return HttpClient(self.name, self.logger, ShopifyErrorHandler(), session=self._http_client._session)
 
     @cached_property
-    def parent_stream(self) -> object:
+    def parent_stream(self) -> Union[ShopifyStream, IncrementalShopifyStream]:
         """
         Returns the instance of parent stream, if the substream has a `parent_stream_class` dependency.
         """
         return self.parent_stream_class(self.config) if self.parent_stream_class else None
 
+    @cached_property
+    def parent_stream_name(self) -> Optional[str]:
+        """
+        Returns the parent stream name, if the substream has a `parent_stream_class` dependency.
+        """
+        return self.parent_stream.name if self.parent_stream_class else None
+
+    @cached_property
+    def parent_stream_cursor(self) -> Optional[str]:
+        """
+        Returns the parent stream cursor, if the substream has a `parent_stream_class` dependency.
+        """
+        return self.parent_stream.cursor_field if self.parent_stream_class else None
+
+    @cached_property
+    def parent_stream_query_cursor_alias(self) -> Optional[str]:
+        if self.parent_stream_name and self.parent_stream_cursor:
+            return f"{self.parent_stream_name}_{self.parent_stream_cursor}"
+
     @property
     @abstractmethod
     def bulk_query(self) -> ShopifyBulkQuery:
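The new `parent_stream_name`, `parent_stream_cursor`, and `parent_stream_query_cursor_alias` properties above derive values from the configured parent stream class and pass them into the BULK query and job manager. A hedged sketch of how the alias is composed; the `customers` / `updated_at` names are assumptions used only for illustration:

    # Illustrative only: how the query cursor alias is derived for a hypothetical substream
    # whose parent stream is named "customers" with cursor field "updated_at".
    parent_stream_name = "customers"
    parent_stream_cursor = "updated_at"

    parent_stream_query_cursor_alias = (
        f"{parent_stream_name}_{parent_stream_cursor}"
        if parent_stream_name and parent_stream_cursor
        else None
    )
    assert parent_stream_query_cursor_alias == "customers_updated_at"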
@@ -691,7 +736,9 @@ class IncrementalShopifyGraphQlBulkStream(IncrementalShopifyStream):
         return None
 
     def get_updated_state(
-        self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]
+        self,
+        current_stream_state: MutableMapping[str, Any],
+        latest_record: Mapping[str, Any],
     ) -> MutableMapping[str, Any]:
         """UPDATING THE STATE OBJECT:
         Stream: CustomerAddress
@@ -706,37 +753,75 @@ class IncrementalShopifyGraphQlBulkStream(IncrementalShopifyStream):
             }
         }
         """
+
         updated_state = super().get_updated_state(current_stream_state, latest_record)
+
         if self.parent_stream_class:
+            # the default way of getting the parent stream state is to use the value from the RecordProducer,
+            # since the parent record could be present but no substream's records are present to emit,
+            # the parent state is tracked for each parent record processed, thus updated regardless having substream records.
+            tracked_parent_state = self.job_manager.record_producer.get_parent_stream_state()
+            # fallback to the record level to search for the parent cursor or use the stream cursor value
+            parent_state = tracked_parent_state if tracked_parent_state else self._get_parent_state_from_record(latest_record)
             # add parent_stream_state to `updated_state`
-            updated_state[self.parent_stream.name] = {self.parent_stream.cursor_field: latest_record.get(self.parent_stream.cursor_field)}
+            updated_state[self.parent_stream_name] = parent_state
+
         return updated_state
 
-    def get_stream_state_value(self, stream_state: Optional[Mapping[str, Any]]) -> str:
-        if self.parent_stream_class:
-            # get parent stream state from the stream_state object.
-            parent_state = stream_state.get(self.parent_stream.name, {})
-            if parent_state:
-                return parent_state.get(self.parent_stream.cursor_field, self.default_state_comparison_value)
-        else:
-            # get the stream state, if no `parent_stream_class` was assigned.
+    def _get_parent_state_from_record(self, latest_record: Mapping[str, Any]) -> MutableMapping[str, Any]:
+        parent_state = latest_record.get(self.parent_stream_name, {})
+        parent_state_value = parent_state.get(self.parent_stream_cursor) if parent_state else latest_record.get(self.parent_stream_cursor)
+        parent_state[self.parent_stream_cursor] = parent_state_value
+        return parent_state
+
+    def _get_stream_cursor_value(self, stream_state: Optional[Mapping[str, Any]] = None) -> Optional[str]:
+        if stream_state:
             return stream_state.get(self.cursor_field, self.default_state_comparison_value)
+        else:
+            return self.config.get("start_date")
 
-    def get_state_value(self, stream_state: Mapping[str, Any] = None) -> Optional[Union[str, int]]:
+    def _get_stream_state_value(self, stream_state: Optional[Mapping[str, Any]] = None) -> Optional[str]:
         if stream_state:
-            return self.get_stream_state_value(stream_state)
+            if self.parent_stream_class:
+                # get parent stream state from the stream_state object.
+                parent_state = stream_state.get(self.parent_stream_name, {})
+                if parent_state:
+                    return parent_state.get(self.parent_stream_cursor, self.default_state_comparison_value)
+                else:
+                    # use the streams cursor value, if no parent state available
+                    return self._get_stream_cursor_value(stream_state)
+            else:
+                # get the stream state, if no `parent_stream_class` was assigned.
+                return self._get_stream_cursor_value(stream_state)
+        else:
+            return self.config.get("start_date")
+
+    def _get_state_value(self, stream_state: Optional[Mapping[str, Any]] = None) -> Optional[Union[str, int]]:
+        if stream_state:
+            return self._get_stream_state_value(stream_state)
         else:
             # for majority of cases we fallback to start_date, otherwise.
             return self.config.get("start_date")
 
     def emit_slice_message(self, slice_start: datetime, slice_end: datetime) -> None:
         slice_size_message = f"Slice size: `P{round(self.job_manager._job_size, 1)}D`"
-        self.logger.info(f"Stream: `{self.name}` requesting BULK Job for period: {slice_start} -- {slice_end}. {slice_size_message}")
+        slice_message = f"Stream: `{self.name}` requesting BULK Job for period: {slice_start} -- {slice_end}. {slice_size_message}."
+
+        if self.job_manager._supports_checkpointing:
+            checkpointing_message = f" The BULK checkpoint after `{self.job_manager.job_checkpoint_interval}` lines."
+        else:
+            checkpointing_message = f" The BULK checkpointing is not supported."
+
+        self.logger.info(slice_message + checkpointing_message)
+
+    def emit_checkpoint_message(self) -> None:
+        if self.job_manager._job_adjust_slice_from_checkpoint:
+            self.logger.info(f"Stream {self.name}, continue from checkpoint: `{self._checkpoint_cursor}`.")
 
     @stream_state_cache.cache_stream_state
     def stream_slices(self, stream_state: Optional[Mapping[str, Any]] = None, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]:
         if self.filter_field:
-            state = self.get_state_value(stream_state)
+            state = self._get_state_value(stream_state)
             start = pdm.parse(state)
             end = pdm.now()
             while start < end:
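The renamed helpers above establish a fallback order for the slice start value: parent stream state when a parent stream class is set, then the stream's own cursor in the state, then the configured `start_date`. A minimal sketch of that resolution order; the function, parameter names, and dictionary shapes are assumptions for illustration only:

    # Illustrative resolution order for the slice start value, mirroring the helpers above.
    def resolve_state_value(stream_state, parent_name=None, parent_cursor=None,
                            cursor_field="updated_at", start_date="2023-01-01T00:00:00Z",
                            default_comparison_value=""):
        if not stream_state:
            # no state at all: fall back to the configured start_date
            return start_date
        if parent_name:
            parent_state = stream_state.get(parent_name, {})
            if parent_state:
                return parent_state.get(parent_cursor, default_comparison_value)
        return stream_state.get(cursor_field, default_comparison_value)

    # parent state wins when present
    resolve_state_value({"customers": {"updated_at": "2024-04-01"}}, "customers", "updated_at")
    # otherwise the stream cursor, then start_date
    resolve_state_value({"updated_at": "2024-02-01"})
    resolve_state_value(None)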
@@ -745,11 +830,28 @@ class IncrementalShopifyGraphQlBulkStream(IncrementalShopifyStream):
                 self.emit_slice_message(start, slice_end)
                 yield {"start": start.to_rfc3339_string(), "end": slice_end.to_rfc3339_string()}
                 # increment the end of the slice or reduce the next slice
-                start = self.job_manager.get_adjusted_job_end(start, slice_end, self._checkpoint_cursor)
+                start = self.job_manager.get_adjusted_job_end(start, slice_end, self._checkpoint_cursor, self._filter_checkpointed_cursor)
         else:
             # for the streams that don't support filtering
             yield {}
 
+    def sort_output_asc(self, non_sorted_records: Iterable[Mapping[str, Any]] = None) -> Iterable[Mapping[str, Any]]:
+        """
+        Apply sorting for collected records, to guarantee the `ASC` output.
+        This handles the STATE and CHECKPOINTING correctly, for the `incremental` streams.
+        """
+        if non_sorted_records:
+            if not self.cursor_field:
+                yield from non_sorted_records
+            else:
+                yield from sorted(
+                    non_sorted_records,
+                    key=lambda x: x.get(self.cursor_field) if x.get(self.cursor_field) else self.default_state_comparison_value,
+                )
+        else:
+            # always return an empty iterable, if no records
+            return []
+
     def read_records(
         self,
         sync_mode: SyncMode,
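Because BULK results are not guaranteed to arrive ordered by the cursor, the new `sort_output_asc` above sorts the collected records before they are filtered against the state, which keeps checkpointed cursors monotonically increasing. A small usage sketch with made-up records:

    # Usage sketch for the ascending sort shown above (records are illustrative).
    records = [
        {"id": 3, "updated_at": "2024-03-02"},
        {"id": 1, "updated_at": "2024-03-01"},
        {"id": 2},  # a missing cursor value falls back to a default comparison value
    ]
    default_state_comparison_value = ""
    ordered = sorted(records, key=lambda r: r.get("updated_at") or default_state_comparison_value)
    # -> the record without `updated_at` sorts first, then 2024-03-01, then 2024-03-02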
@@ -759,13 +861,38 @@ class IncrementalShopifyGraphQlBulkStream(IncrementalShopifyStream):
     ) -> Iterable[StreamData]:
         self.job_manager.create_job(stream_slice, self.filter_field)
         stream_state = stream_state_cache.cached_state.get(self.name, {self.cursor_field: self.default_state_comparison_value})
+        # add `shop_url` field to each record produced
+        records = self.add_shop_url_field(
+            # produce records from saved bulk job result
+            self.job_manager.job_get_results()
+        )
+        # emit records in ASC order
+        yield from self.filter_records_newer_than_state(stream_state, self.sort_output_asc(records))
+        # add log message about the checkpoint value
+        self.emit_checkpoint_message()
+
+
+class FullRefreshShopifyGraphQlBulkStream(ShopifyStream):
+    data_field = "graphql"
+    http_method = "POST"
+
+    query: DeliveryZoneList
+    response_field: str
 
-        filename = self.job_manager.job_check_for_completion()
-        # the `filename` could be `None`, meaning there are no data available for the slice period.
-        if filename:
-            # add `shop_url` field to each record produced
-            records = self.add_shop_url_field(
-                # produce records from saved bulk job result
-                self.record_producer.read_file(filename)
-            )
-            yield from self.filter_records_newer_than_state(stream_state, records)
+    def request_body_json(
+        self,
+        stream_state: Optional[Mapping[str, Any]],
+        stream_slice: Optional[Mapping[str, Any]] = None,
+        next_page_token: Optional[Mapping[str, Any]] = None,
+    ) -> Optional[Mapping[str, Any]]:
+        return {"query": self.query().get()}
+
+    @limiter.balance_rate_limit(api_type=ApiTypeEnum.graphql.value)
+    def parse_response(self, response: requests.Response, **kwargs) -> Iterable[Mapping]:
+        if response.status_code is requests.codes.OK:
+            try:
+                json_response = response.json().get("data", {}).get(self.response_field, {}).get("nodes", [])
+                yield from json_response
+            except RequestException as e:
+                self.logger.warning(f"Unexpected error in `parse_response`: {e}, the actual response data: {response.text}")
+                yield {}
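The new `FullRefreshShopifyGraphQlBulkStream` above posts a plain GraphQL query and reads `data.<response_field>.nodes` from the response. A hedged sketch of that extraction against a made-up response body; `deliveryProfiles` is only an example value for `response_field`, not necessarily what the connector uses:

    # Illustrative only: how parse_response above unpacks a GraphQL payload.
    response_field = "deliveryProfiles"  # example value; the real field is set per stream
    payload = {
        "data": {
            "deliveryProfiles": {
                "nodes": [
                    {"id": "gid://shopify/DeliveryProfile/1"},
                    {"id": "gid://shopify/DeliveryProfile/2"},
                ]
            }
        }
    }
    nodes = payload.get("data", {}).get(response_field, {}).get("nodes", [])
    # -> two node records, yielded one at a time by the stream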