@semiont/content 0.2.28 → 0.2.29-build.42
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +184 -42
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
# @semiont/content
|
|
2
2
|
|
|
3
|
+
[npm version](https://www.npmjs.com/package/@semiont/content)
|
|
3
4
|
[Tests](https://github.com/The-AI-Alliance/semiont/actions/workflows/package-tests.yml?query=branch%3Amain+is%3Asuccess+job%3A%22Test+content%22)
|
|
4
5
|
|
|
5
|
-
Content-addressed storage
|
|
6
|
+
Content-addressed storage using SHA-256 checksums with automatic deduplication and W3C compliance.
|
|
6
7
|
|
|
7
8
|
## Installation
|
|
8
9
|
|
|
@@ -23,85 +24,226 @@ const store = new FilesystemRepresentationStore({
|
|
|
23
24
|
const content = Buffer.from('Hello, World!');
|
|
24
25
|
const stored = await store.store(content, {
|
|
25
26
|
mediaType: 'text/plain',
|
|
27
|
+
language: 'en',
|
|
26
28
|
rel: 'original'
|
|
27
29
|
});
|
|
28
30
|
|
|
31
|
+
console.log(stored.checksum); // sha256:abc123...
|
|
32
|
+
|
|
29
33
|
// Retrieve by checksum
|
|
30
34
|
const retrieved = await store.retrieve(stored.checksum, 'text/plain');
|
|
35
|
+
console.log(retrieved.toString()); // "Hello, World!"
|
|
36
|
+
|
|
37
|
+
// Same content = same checksum (deduplication)
|
|
38
|
+
const duplicate = await store.store(content, {
|
|
39
|
+
mediaType: 'text/plain',
|
|
40
|
+
rel: 'copy'
|
|
41
|
+
});
|
|
42
|
+
|
|
43
|
+
console.log(duplicate.checksum === stored.checksum); // true
|
|
31
44
|
```
|
|
32
45
|
|
|
33
|
-
|
|
46
|
+
## Features
|
|
47
|
+
|
|
48
|
+
- 🔐 **Content-Addressed** - SHA-256 checksum as identifier
|
|
49
|
+
- 🎯 **Automatic Deduplication** - Identical content stored once
|
|
50
|
+
- 🗂️ **Smart Sharding** - 65,536 directories for scalability
|
|
51
|
+
- 📊 **W3C Compliant** - Full representation metadata support
|
|
52
|
+
- 🏷️ **MIME Type Support** - 80+ types with proper extensions
|
|
53
|
+
- 🌍 **Multilingual** - Language and encoding metadata
|
|
54
|
+
|
|
55
|
+
## Documentation
|
|
56
|
+
|
|
57
|
+
- [API Reference](./docs/API.md) - Complete API documentation
|
|
58
|
+
- [Architecture](./docs/ARCHITECTURE.md) - Design principles
|
|
59
|
+
- [Patterns](./docs/PATTERNS.md) - Usage patterns and best practices
|
|
60
|
+
|
|
61
|
+
## Examples
|
|
62
|
+
|
|
63
|
+
- [Basic Example](./examples/basic.ts) - Storage and retrieval
|
|
64
|
+
- [Deduplication](./examples/deduplication.ts) - Content addressing benefits
|
|
65
|
+
- [Binary Content](./examples/binary.ts) - Images and documents
|
|
66
|
+
|
|
67
|
+
## Storage Architecture
|
|
68
|
+
|
|
69
|
+
### Content Addressing
|
|
34
70
|
|
|
35
|
-
|
|
71
|
+
Every piece of content is addressed by its SHA-256 checksum:
|
|
36
72
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
- **MIME Type Support**: 80+ types with proper file extensions
|
|
42
|
-
- **Character Encoding**: Preserves charset in metadata
|
|
73
|
+
```typescript
|
|
74
|
+
const checksum = calculateChecksum(content);
|
|
75
|
+
// sha256:5aaa0b72c1f4d8e7a9f2c8b3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0c1d2e3
|
|
76
|
+
```
|
|
43
77
|
|
|
44
|
-
|
|
78
|
+
### Storage Path Structure
|
|
45
79
|
|
|
46
80
|
```
|
|
47
|
-
basePath/
|
|
81
|
+
basePath/
|
|
82
|
+
└── representations/
|
|
83
|
+
└── {mediaType}/ # MIME type with "/" encoded as "~1" (e.g. text~1plain)
|
|
84
|
+
└── {ab}/ # First 2 hex chars of checksum
|
|
85
|
+
└── {cd}/ # Next 2 hex chars (65,536 shards)
|
|
86
|
+
└── rep-{checksum}.{ext}
|
|
48
87
|
```
|
|
49
88
|
|
|
50
|
-
Example:
|
|
89
|
+
Example paths:
|
|
90
|
+
```
|
|
91
|
+
representations/text~1plain/5a/aa/rep-5aaa0b72...abc.txt
|
|
92
|
+
representations/image~1png/ff/12/rep-ff123456...def.png
|
|
93
|
+
representations/application~1json/ab/cd/rep-abcd1234...123.json
|
|
94
|
+
```
|
|
51
95
|
|
|
52
|
-
|
|
96
|
+
### Deduplication
|
|
53
97
|
|
|
54
|
-
|
|
98
|
+
Content-addressed storage provides automatic deduplication:
|
|
99
|
+
|
|
100
|
+
```typescript
|
|
101
|
+
// Store same content 100 times
|
|
102
|
+
for (let i = 0; i < 100; i++) {
|
|
103
|
+
await store.store(identicalContent, metadata);
|
|
104
|
+
}
|
|
105
|
+
// Result: Only ONE file on disk
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## API Overview
|
|
55
109
|
|
|
56
110
|
### FilesystemRepresentationStore
|
|
57
111
|
|
|
58
112
|
```typescript
|
|
59
|
-
new FilesystemRepresentationStore(
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
113
|
+
const store = new FilesystemRepresentationStore({
|
|
114
|
+
basePath: '/data/storage' // Root storage directory
|
|
115
|
+
});
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Store Content
|
|
119
|
+
|
|
120
|
+
```typescript
|
|
121
|
+
const stored = await store.store(
|
|
122
|
+
content: Buffer,
|
|
123
|
+
metadata: {
|
|
124
|
+
mediaType: string; // Required: MIME type
|
|
125
|
+
filename?: string; // Optional: Original filename
|
|
126
|
+
encoding?: string; // Optional: Character encoding
|
|
127
|
+
language?: string; // Optional: ISO language code
|
|
128
|
+
rel?: string; // Optional: Relationship type
|
|
129
|
+
}
|
|
130
|
+
): Promise<StoredRepresentation>
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Retrieve Content
|
|
63
134
|
|
|
64
|
-
|
|
65
|
-
|
|
135
|
+
```typescript
|
|
136
|
+
const buffer = await store.retrieve(
|
|
137
|
+
checksum: string, // SHA-256 checksum
|
|
138
|
+
mediaType: string // MIME type for path lookup
|
|
139
|
+
): Promise<Buffer>
|
|
66
140
|
```
|
|
67
141
|
|
|
68
142
|
### Types
|
|
69
143
|
|
|
70
144
|
```typescript
|
|
71
|
-
interface
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
145
|
+
interface StoredRepresentation {
|
|
146
|
+
'@id': string; // Content URI
|
|
147
|
+
checksum: string; // SHA-256 hex (64 chars)
|
|
148
|
+
byteSize: number; // Content size in bytes
|
|
149
|
+
mediaType: string; // MIME type
|
|
150
|
+
created: string; // ISO 8601 timestamp
|
|
151
|
+
language?: string; // ISO language code
|
|
152
|
+
encoding?: string; // Character encoding
|
|
153
|
+
rel?: string; // Relationship type
|
|
77
154
|
}
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
## Supported MIME Types
|
|
158
|
+
|
|
159
|
+
The package includes 80+ MIME type mappings:
|
|
160
|
+
|
|
161
|
+
| Type | Extensions | Example |
|
|
162
|
+
|------|-----------|---------|
|
|
163
|
+
| Text | `.txt`, `.md`, `.html`, `.csv` | `text/plain` → `.txt` |
|
|
164
|
+
| Documents | `.pdf`, `.doc`, `.docx` | `application/pdf` → `.pdf` |
|
|
165
|
+
| Images | `.png`, `.jpg`, `.gif`, `.webp` | `image/png` → `.png` |
|
|
166
|
+
| Audio | `.mp3`, `.wav`, `.ogg` | `audio/mpeg` → `.mp3` |
|
|
167
|
+
| Video | `.mp4`, `.webm`, `.mov` | `video/mp4` → `.mp4` |
|
|
168
|
+
| Code | `.js`, `.ts`, `.py`, `.java` | `text/javascript` → `.js` |
|
|
169
|
+
| Data | `.json`, `.xml`, `.yaml` | `application/json` → `.json` |
|
|
170
|
+
|
|
171
|
+
Unknown types default to `.dat` extension.
|
|
172
|
+
|
|
173
|
+
## W3C Compliance
|
|
174
|
+
|
|
175
|
+
Full support for W3C representation metadata:
|
|
176
|
+
|
|
177
|
+
```typescript
|
|
178
|
+
const stored = await store.store(content, {
|
|
179
|
+
mediaType: 'text/html',
|
|
180
|
+
language: 'en-US',
|
|
181
|
+
encoding: 'UTF-8',
|
|
182
|
+
rel: 'original'
|
|
183
|
+
});
|
|
78
184
|
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
185
|
+
// W3C-compliant metadata
|
|
186
|
+
{
|
|
187
|
+
"@id": "urn:sha256:abc123...",
|
|
188
|
+
"@type": "Representation",
|
|
189
|
+
"checksum": "sha256:abc123...",
|
|
190
|
+
"mediaType": "text/html",
|
|
191
|
+
"language": "en-US",
|
|
192
|
+
"encoding": "UTF-8",
|
|
193
|
+
"rel": "original",
|
|
194
|
+
"byteSize": 1234,
|
|
195
|
+
"created": "2024-01-01T00:00:00Z"
|
|
84
196
|
}
|
|
85
197
|
```
|
|
86
198
|
|
|
87
|
-
|
|
199
|
+
## Performance
|
|
200
|
+
|
|
201
|
+
- **SHA-256 Calculation**: ~500 MB/s on modern CPUs
|
|
202
|
+
- **Write Performance**: Limited by filesystem (typically ~100 MB/s)
|
|
203
|
+
- **Read Performance**: O(1) direct path lookup
|
|
204
|
+
- **Sharding**: 65,536 directories prevent filesystem bottlenecks
|
|
205
|
+
- **Deduplication**: 100% space savings for duplicate content
|
|
206
|
+
|
|
207
|
+
## Best Practices
|
|
208
|
+
|
|
209
|
+
1. **Use Buffers**: Always pass content as Buffer for binary safety
|
|
210
|
+
2. **Specify MIME Types**: Required for proper file extensions
|
|
211
|
+
3. **Add Language Metadata**: Important for multilingual content
|
|
212
|
+
4. **Handle Missing Content**: Expect `retrieve()` to fail with `ENOENT` for unknown checksums and handle it (see Error Handling)
|
|
213
|
+
5. **Monitor Storage**: Track disk usage and shard distribution
|
|
88
214
|
|
|
89
|
-
|
|
215
|
+
## Error Handling
|
|
90
216
|
|
|
91
217
|
```typescript
|
|
92
|
-
|
|
93
|
-
|
|
218
|
+
try {
|
|
219
|
+
const retrieved = await store.retrieve(checksum, mediaType);
|
|
220
|
+
} catch (error) {
|
|
221
|
+
if (error.code === 'ENOENT') {
|
|
222
|
+
// Content not found
|
|
223
|
+
} else if (error.code === 'EACCES') {
|
|
224
|
+
// Permission denied
|
|
225
|
+
} else {
|
|
226
|
+
// Other filesystem error
|
|
227
|
+
}
|
|
228
|
+
}
|
|
94
229
|
```
|
|
95
230
|
|
|
96
|
-
|
|
231
|
+
## Development
|
|
97
232
|
|
|
98
|
-
|
|
233
|
+
```bash
|
|
234
|
+
# Install dependencies
|
|
235
|
+
npm install
|
|
99
236
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
237
|
+
# Build package
|
|
238
|
+
npm run build
|
|
239
|
+
|
|
240
|
+
# Run tests
|
|
241
|
+
npm test
|
|
242
|
+
|
|
243
|
+
# Type checking
|
|
244
|
+
npm run typecheck
|
|
245
|
+
```
|
|
104
246
|
|
|
105
247
|
## License
|
|
106
248
|
|
|
107
|
-
Apache-2.0
|
|
249
|
+
Apache-2.0
|