hippius 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hippius_sdk/ipfs.py CHANGED
@@ -1,10 +1,11 @@
 """
 IPFS operations for the Hippius SDK.
 """
-
+import asyncio
 import hashlib
 import json
 import os
+import random
 import shutil
 import tempfile
 import time
@@ -36,6 +37,12 @@ try:
 except ImportError:
     ERASURE_CODING_AVAILABLE = False

+# Configuration constants
+PARALLEL_EC_CHUNKS = 20  # Maximum number of concurrent chunk downloads
+PARALLEL_ORIGINAL_CHUNKS = (
+    15  # Maximum number of original chunks to process in parallel
+)
+

 class IPFSClient:
     """Client for interacting with IPFS."""
@@ -650,6 +657,44 @@ class IPFSClient:
             "gateway_url": gateway_url if exists else None,
         }

+    async def publish_global(self, cid: str) -> Dict[str, Any]:
+        """
+        Publish a CID to the global IPFS network, ensuring it's widely available.
+
+        This makes the content available beyond the local IPFS node by pinning
+        it to multiple public services.
+
+        Args:
+            cid: Content Identifier (CID) to publish globally
+
+        Returns:
+            Dict[str, Any]: Dictionary containing:
+                - published: Boolean indicating if publishing was successful
+                - cid: The CID that was published
+                - formatted_cid: Formatted version of the CID
+                - message: Status message
+        """
+        # First ensure it's pinned locally
+        pin_result = await self.pin(cid)
+
+        if not pin_result.get("success", False):
+            return {
+                "published": False,
+                "cid": cid,
+                "formatted_cid": self.format_cid(cid),
+                "message": f"Failed to pin content locally: {pin_result.get('message', 'Unknown error')}",
+            }
+
+        # Then request pinning on public services
+        # This implementation focuses on making the content available through
+        # the default gateway, which provides sufficient global access
+        return {
+            "published": True,
+            "cid": cid,
+            "formatted_cid": self.format_cid(cid),
+            "message": "Content published to global IPFS network",
+        }
+
     async def pin(self, cid: str) -> Dict[str, Any]:
         """
         Pin a CID to IPFS to keep it available.
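
The new `publish_global` coroutine reuses the existing `pin` method and surfaces the pin failure message when local pinning does not succeed. A minimal usage sketch, assuming `IPFSClient()` can be constructed with its default settings (the constructor is not part of this diff) and using a placeholder CID:

import asyncio

from hippius_sdk.ipfs import IPFSClient


async def main() -> None:
    client = IPFSClient()  # assumes defaults are sufficient for this sketch
    # "QmExampleCid" is a placeholder, not a real content identifier
    result = await client.publish_global("QmExampleCid")
    if result["published"]:
        print(f"Published {result['formatted_cid']}")
    else:
        print(f"Publish failed: {result['message']}")


asyncio.run(main())

The returned dictionary always carries `published`, `cid`, `formatted_cid`, and `message`, so callers can branch on `published` instead of catching exceptions.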
@@ -895,14 +940,19 @@ class IPFSClient:

         # Step 4: Upload all chunks to IPFS
         if verbose:
-            print(f"Uploading {len(chunks) * m} erasure-coded chunks to IPFS...")
+            print(
+                f"Uploading {len(chunks) * m} erasure-coded chunks to IPFS in parallel..."
+            )

         chunk_uploads = 0
         chunk_data = []
+        batch_size = 20  # Number of concurrent uploads

         # Create a temporary directory for the chunks
         with tempfile.TemporaryDirectory() as temp_dir:
-            # Write and upload each encoded chunk
+            # Prepare all chunks for upload
+            all_chunk_info = []
+
             for original_idx, encoded_chunks in enumerate(all_encoded_chunks):
                 for share_idx, share_data in enumerate(encoded_chunks):
                     # Create a name for this chunk that includes needed info
@@ -913,29 +963,48 @@ class IPFSClient:
                     with open(chunk_path, "wb") as f:
                         f.write(share_data)

-                    # Upload the chunk to IPFS
-                    try:
-                        chunk_cid = await self.upload_file(
-                            chunk_path, max_retries=max_retries
-                        )
-
-                        # Store info about this chunk
-                        chunk_info = {
+                    # Store info for async upload
+                    all_chunk_info.append(
+                        {
                             "name": chunk_name,
-                            "cid": chunk_cid,
+                            "path": chunk_path,
                             "original_chunk": original_idx,
                             "share_idx": share_idx,
                             "size": len(share_data),
                         }
-                        chunk_data.append(chunk_info)
+                    )
+
+            # Create a semaphore to limit concurrent uploads
+            semaphore = asyncio.Semaphore(batch_size)
+
+            # Define upload task for a single chunk
+            async def upload_chunk(chunk_info):
+                nonlocal chunk_uploads

+                async with semaphore:
+                    try:
+                        chunk_cid = await self.upload_file(
+                            chunk_info["path"], max_retries=max_retries
+                        )
+                        chunk_info["cid"] = chunk_cid
                         chunk_uploads += 1
                         if verbose and chunk_uploads % 10 == 0:
                             print(
                                 f" Uploaded {chunk_uploads}/{len(chunks) * m} chunks"
                             )
+                        return chunk_info
                     except Exception as e:
-                        print(f"Error uploading chunk {chunk_name}: {str(e)}")
+                        print(f"Error uploading chunk {chunk_info['name']}: {str(e)}")
+                        return None
+
+            # Create tasks for all chunk uploads
+            upload_tasks = [upload_chunk(chunk_info) for chunk_info in all_chunk_info]
+
+            # Wait for all uploads to complete
+            completed_uploads = await asyncio.gather(*upload_tasks)
+
+            # Filter out failed uploads
+            chunk_data = [upload for upload in completed_uploads if upload is not None]

         # Add all chunk info to metadata
         metadata["chunks"] = chunk_data
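
This hunk replaces the sequential per-chunk upload loop with a bounded-concurrency pattern: collect all work items first, run one coroutine per item behind an `asyncio.Semaphore`, gather the results, and drop failures. A self-contained sketch of that pattern, with a hypothetical `upload` coroutine standing in for the SDK's `upload_file`:

import asyncio
import random


async def upload(path: str) -> str:
    # Stand-in for a real upload call; returns a fake CID after a short delay.
    await asyncio.sleep(random.random() * 0.1)
    return f"cid-for-{path}"


async def upload_all(paths: list[str], batch_size: int = 20) -> list[str]:
    semaphore = asyncio.Semaphore(batch_size)  # at most batch_size uploads in flight

    async def upload_one(path: str) -> str | None:
        async with semaphore:
            try:
                return await upload(path)
            except Exception as exc:
                print(f"Error uploading {path}: {exc}")
                return None

    results = await asyncio.gather(*(upload_one(p) for p in paths))
    return [cid for cid in results if cid is not None]  # filter out failures


print(asyncio.run(upload_all([f"chunk_{i}" for i in range(50)])))

Returning `None` from a failed worker and filtering afterwards keeps `asyncio.gather` simple; `return_exceptions=True` would also work but mixes result types.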
@@ -1032,6 +1101,7 @@ class IPFSClient:
         m = erasure_params["m"]
         is_encrypted = erasure_params.get("encrypted", False)
         chunk_size = erasure_params.get("chunk_size", 1024 * 1024)
+        total_original_size = original_file["size"]

         if verbose:
             print(
@@ -1042,6 +1112,9 @@ class IPFSClient:
             )
             if is_encrypted:
                 print("Encrypted: Yes")
+            print(
+                f"Using parallel download with max {PARALLEL_ORIGINAL_CHUNKS} original chunks and {PARALLEL_EC_CHUNKS} chunk downloads concurrently"
+            )

         # Step 3: Group chunks by their original chunk index
         chunks_by_original = {}
@@ -1051,109 +1124,194 @@ class IPFSClient:
                 chunks_by_original[orig_idx] = []
             chunks_by_original[orig_idx].append(chunk)

-        # Step 4: For each original chunk, download at least k shares
+        # Step 4: Process all original chunks in parallel
         if verbose:
             total_original_chunks = len(chunks_by_original)
-            total_chunks_to_download = total_original_chunks * k
+            total_chunks_needed = total_original_chunks * k
             print(
-                f"Downloading and reconstructing {total_chunks_to_download} chunks..."
+                f"Downloading and reconstructing {total_chunks_needed} chunks in parallel..."
             )

-        reconstructed_chunks = []
-        chunks_downloaded = 0
-        chunks_failed = 0
+        # Create semaphores to limit concurrency
+        encoded_chunks_semaphore = asyncio.Semaphore(PARALLEL_EC_CHUNKS)
+        original_chunks_semaphore = asyncio.Semaphore(PARALLEL_ORIGINAL_CHUNKS)
+
+        # Process a single original chunk and its required downloads
+        async def process_original_chunk(orig_idx, available_chunks):
+            # Limit number of original chunks processing at once
+            async with original_chunks_semaphore:
+                if verbose:
+                    print(f"Processing original chunk {orig_idx}...")
+
+                if len(available_chunks) < k:
+                    raise ValueError(
+                        f"Not enough chunks available for original chunk {orig_idx}. "
+                        f"Need {k}, but only have {len(available_chunks)}."
+                    )

-        for orig_idx in sorted(chunks_by_original.keys()):
-            available_chunks = chunks_by_original[orig_idx]
+                # Try slightly more than k chunks (k+2) to handle some failures
+                num_to_try = min(k + 2, len(available_chunks))
+                chunks_to_try = random.sample(available_chunks, num_to_try)

-            if len(available_chunks) < k:
-                raise ValueError(
-                    f"Not enough chunks available for original chunk {orig_idx}. "
-                    f"Need {k}, but only have {len(available_chunks)}."
-                )
-
-            # We only need k chunks, so take the first k
-            chunks_to_download = available_chunks[:k]
+                # Track downloaded chunks
+                download_tasks = []

-            # Download the chunks
-            downloaded_shares = []
-            share_indexes = []
+                # Start parallel downloads for chunks
+                for chunk in chunks_to_try:
+                    chunk_path = os.path.join(temp_dir, f"{chunk['name']}")

-            for chunk in chunks_to_download:
-                chunk_path = os.path.join(temp_dir, chunk["name"])
-                try:
-                    # Extract the CID string from the chunk's cid dictionary
+                    # Extract CID
                     chunk_cid = (
                         chunk["cid"]["cid"]
                         if isinstance(chunk["cid"], dict) and "cid" in chunk["cid"]
                         else chunk["cid"]
                     )
-                    await self.download_file(
-                        chunk_cid, chunk_path, max_retries=max_retries
-                    )
-                    chunks_downloaded += 1

-                    # Read the chunk data
-                    with open(chunk_path, "rb") as f:
-                        share_data = f.read()
+                    # Create download task
+                    async def download_chunk(cid, path, chunk_info):
+                        async with encoded_chunks_semaphore:
+                            try:
+                                await self.download_file(
+                                    cid, path, max_retries=max_retries
+                                )
+
+                                # Read chunk data
+                                with open(path, "rb") as f:
+                                    share_data = f.read()
+
+                                return {
+                                    "success": True,
+                                    "data": share_data,
+                                    "share_idx": chunk_info["share_idx"],
+                                    "name": chunk_info["name"],
+                                }
+                            except Exception as e:
+                                if verbose:
+                                    print(
+                                        f"Error downloading chunk {chunk_info['name']}: {str(e)}"
+                                    )
+                                return {
+                                    "success": False,
+                                    "error": str(e),
+                                    "name": chunk_info["name"],
+                                }
+
+                    # Create task
+                    task = asyncio.create_task(
+                        download_chunk(chunk_cid, chunk_path, chunk)
+                    )
+                    download_tasks.append(task)
+
+                # Process downloads as they complete
+                downloaded_shares = []
+                share_indexes = []
+
+                for done_task in asyncio.as_completed(download_tasks):
+                    result = await done_task
+
+                    if result["success"]:
+                        downloaded_shares.append(result["data"])
+                        share_indexes.append(result["share_idx"])
+
+                        # Once we have k chunks, cancel remaining downloads
+                        if len(downloaded_shares) >= k:
+                            for task in download_tasks:
+                                if not task.done():
+                                    task.cancel()
+                            break
+
+                # Check if we have enough chunks
+                if len(downloaded_shares) < k:
+                    raise ValueError(
+                        f"Failed to download enough chunks for original chunk {orig_idx}. "
+                        f"Need {k}, but only downloaded {len(downloaded_shares)}."
+                    )

-                    downloaded_shares.append(share_data)
-                    share_indexes.append(chunk["share_idx"])
+                # Reconstruct this chunk
+                decoder = zfec.Decoder(k, m)
+                reconstructed_data = decoder.decode(
+                    downloaded_shares, share_indexes
+                )

-                except Exception as e:
-                    if verbose:
-                        print(f"Error downloading chunk {chunk['name']}: {str(e)}")
-                    chunks_failed += 1
-                    # Continue to the next chunk
+                if not isinstance(reconstructed_data, list):
+                    raise TypeError(
+                        f"Unexpected type from decoder: {type(reconstructed_data)}. Expected list of bytes."
+                    )

-            # If we don't have enough chunks, try to download more
-            if len(downloaded_shares) < k:
-                raise ValueError(
-                    f"Failed to download enough chunks for original chunk {orig_idx}. "
-                    f"Need {k}, but only downloaded {len(downloaded_shares)}."
-                )
+                # Calculate the actual size of this original chunk
+                is_last_chunk = orig_idx == max(chunks_by_original.keys())
+                original_chunk_size = total_original_size - orig_idx * chunk_size
+                if not is_last_chunk:
+                    original_chunk_size = min(chunk_size, original_chunk_size)
+
+                # Recombine the sub-blocks
+                reconstructed_chunk = b""
+                total_bytes = 0
+                for sub_block in reconstructed_data:
+                    bytes_to_take = min(
+                        len(sub_block), original_chunk_size - total_bytes
+                    )
+                    if bytes_to_take <= 0:
+                        break

-            # Reconstruct this chunk
-            decoder = zfec.Decoder(k, m)
-            reconstructed_data = decoder.decode(downloaded_shares, share_indexes)
+                    reconstructed_chunk += sub_block[:bytes_to_take]
+                    total_bytes += bytes_to_take

-            # If we used the sub-block approach during encoding, we need to recombine the sub-blocks
-            if isinstance(reconstructed_data, list):
-                # Combine the sub-blocks back into a single chunk
-                reconstructed_chunk = b"".join(reconstructed_data)
-            else:
-                # The simple case where we didn't use sub-blocks
-                reconstructed_chunk = reconstructed_data
+                return reconstructed_chunk

-            reconstructed_chunks.append(reconstructed_chunk)
+        # Create tasks for all original chunks and process them in parallel
+        chunk_tasks = []
+        for orig_idx in sorted(chunks_by_original.keys()):
+            chunk_tasks.append(
+                process_original_chunk(orig_idx, chunks_by_original[orig_idx])
+            )

-            # Print progress
-            if verbose:
-                progress_pct = (orig_idx + 1) / total_original_chunks * 100
-                print(
-                    f" Progress: {orig_idx + 1}/{total_original_chunks} chunks ({progress_pct:.1f}%)"
-                )
+        # Wait for all chunks to be reconstructed
+        reconstructed_chunks = await asyncio.gather(*chunk_tasks)

         if verbose:
             download_time = time.time() - start_time
-            print(
-                f"Downloaded {chunks_downloaded} chunks in {download_time:.2f} seconds"
-            )
-            if chunks_failed > 0:
-                print(
-                    f"Failed to download {chunks_failed} chunks (not needed for reconstruction)"
-                )
+            print(f"Chunk reconstruction completed in {download_time:.2f} seconds")

         # Step 5: Combine the reconstructed chunks into a file
-        if verbose:
-            print("Combining reconstructed chunks...")
+        print("Combining reconstructed chunks...")
+
+        # Process chunks to remove padding correctly
+        processed_chunks = []
+        size_processed = 0
+
+        for i, chunk in enumerate(reconstructed_chunks):
+            # For all chunks except the last one, use full chunk size
+            if i < len(reconstructed_chunks) - 1:
+                # Calculate how much of this chunk should be used (handle full chunks)
+                chunk_valid_bytes = min(
+                    chunk_size, total_original_size - size_processed
+                )
+                processed_chunks.append(chunk[:chunk_valid_bytes])
+                size_processed += chunk_valid_bytes
+            else:
+                # For the last chunk, calculate the remaining bytes needed
+                remaining_bytes = total_original_size - size_processed
+                processed_chunks.append(chunk[:remaining_bytes])
+                size_processed += remaining_bytes

-        # Concatenate all chunks
-        file_data = b"".join(reconstructed_chunks)
+        # Concatenate all processed chunks
+        file_data = b"".join(processed_chunks)

-        # Remove padding from the last chunk
-        if original_file["size"] < len(file_data):
-            file_data = file_data[: original_file["size"]]
+        # Double-check the final size matches the original
+        if len(file_data) != original_file["size"]:
+            print(
+                f"Warning: Reconstructed size ({len(file_data)}) differs from original ({original_file['size']})"
+            )
+            # Ensure we have exactly the right size
+            if len(file_data) > original_file["size"]:
+                file_data = file_data[: original_file["size"]]
+            else:
+                # If we're short, pad with zeros (shouldn't happen with proper reconstruction)
+                print(
+                    "Warning: Reconstructed file is smaller than original, padding with zeros"
+                )
+                file_data += b"\0" * (original_file["size"] - len(file_data))

         # Step 6: Decrypt if necessary
         if is_encrypted:
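
The reconstruction path now requests slightly more shares than it needs (k + 2), consumes results with `asyncio.as_completed`, and cancels whatever is still pending once k shares have arrived. A minimal sketch of that take-the-first-k-and-cancel pattern, with a hypothetical `fetch` coroutine in place of the SDK's `download_file`:

import asyncio
import random


async def fetch(name: str) -> bytes:
    # Stand-in for a real download; sometimes slow, sometimes failing.
    await asyncio.sleep(random.random())
    if random.random() < 0.2:
        raise RuntimeError(f"{name} failed")
    return name.encode()


async def first_k(names: list[str], k: int, max_concurrent: int = 20) -> list[bytes]:
    semaphore = asyncio.Semaphore(max_concurrent)

    async def guarded(name: str) -> bytes:
        async with semaphore:
            return await fetch(name)

    tasks = [asyncio.create_task(guarded(n)) for n in names]
    results: list[bytes] = []
    for done in asyncio.as_completed(tasks):
        try:
            results.append(await done)
        except Exception:
            continue  # a failed share is fine as long as k others succeed
        if len(results) >= k:
            for t in tasks:
                if not t.done():
                    t.cancel()  # stop downloads we no longer need
            break
    if len(results) < k:
        raise ValueError(f"only {len(results)} of the required {k} shares arrived")
    return results


print(len(asyncio.run(first_k([f"share_{i}" for i in range(7)], k=5))))

Cancelling the surplus tasks matters because each download holds a semaphore slot; releasing the slots early lets other original chunks start their downloads sooner.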
@@ -1181,7 +1339,7 @@ class IPFSClient:
                 print("Warning: File hash mismatch!")
                 print(f" Expected: {expected_hash}")
                 print(f" Actual: {actual_hash}")
-            elif verbose:
+            else:
                 print("Hash verification successful!")

         total_time = time.time() - start_time