@fluidframework/merge-tree 2.0.0-dev.4.3.0.159619 → 2.0.0-dev.4.4.0.161322
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +13 -13
- package/.vscode/launch.json +0 -16
- package/docs/Attribution.md +0 -382
- package/docs/DEV.md +0 -22
- package/docs/Obliterate.md +0 -647
- package/docs/REFERENCEPOSITIONS.md +0 -199
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@fluidframework/merge-tree",
|
|
3
|
-
"version": "2.0.0-dev.4.
|
|
3
|
+
"version": "2.0.0-dev.4.4.0.161322",
|
|
4
4
|
"description": "Merge tree",
|
|
5
5
|
"homepage": "https://fluidframework.com",
|
|
6
6
|
"repository": {
|
|
@@ -37,27 +37,27 @@
|
|
|
37
37
|
"dependencies": {
|
|
38
38
|
"@fluidframework/common-definitions": "^0.20.1",
|
|
39
39
|
"@fluidframework/common-utils": "^1.1.1",
|
|
40
|
-
"@fluidframework/container-definitions": "2.0.0-dev.4.
|
|
41
|
-
"@fluidframework/container-utils": "2.0.0-dev.4.
|
|
42
|
-
"@fluidframework/core-interfaces": "2.0.0-dev.4.
|
|
43
|
-
"@fluidframework/datastore-definitions": "2.0.0-dev.4.
|
|
40
|
+
"@fluidframework/container-definitions": "2.0.0-dev.4.4.0.161322",
|
|
41
|
+
"@fluidframework/container-utils": "2.0.0-dev.4.4.0.161322",
|
|
42
|
+
"@fluidframework/core-interfaces": "2.0.0-dev.4.4.0.161322",
|
|
43
|
+
"@fluidframework/datastore-definitions": "2.0.0-dev.4.4.0.161322",
|
|
44
44
|
"@fluidframework/protocol-definitions": "^1.1.0",
|
|
45
|
-
"@fluidframework/runtime-definitions": "2.0.0-dev.4.
|
|
46
|
-
"@fluidframework/runtime-utils": "2.0.0-dev.4.
|
|
47
|
-
"@fluidframework/shared-object-base": "2.0.0-dev.4.
|
|
48
|
-
"@fluidframework/telemetry-utils": "2.0.0-dev.4.
|
|
45
|
+
"@fluidframework/runtime-definitions": "2.0.0-dev.4.4.0.161322",
|
|
46
|
+
"@fluidframework/runtime-utils": "2.0.0-dev.4.4.0.161322",
|
|
47
|
+
"@fluidframework/shared-object-base": "2.0.0-dev.4.4.0.161322",
|
|
48
|
+
"@fluidframework/telemetry-utils": "2.0.0-dev.4.4.0.161322"
|
|
49
49
|
},
|
|
50
50
|
"devDependencies": {
|
|
51
|
-
"@fluid-internal/stochastic-test-utils": "2.0.0-dev.4.
|
|
52
|
-
"@fluid-internal/test-pairwise-generator": "2.0.0-dev.4.
|
|
51
|
+
"@fluid-internal/stochastic-test-utils": "2.0.0-dev.4.4.0.161322",
|
|
52
|
+
"@fluid-internal/test-pairwise-generator": "2.0.0-dev.4.4.0.161322",
|
|
53
53
|
"@fluid-tools/benchmark": "^0.47.0",
|
|
54
54
|
"@fluid-tools/build-cli": "^0.17.0",
|
|
55
55
|
"@fluidframework/build-common": "^1.1.0",
|
|
56
56
|
"@fluidframework/build-tools": "^0.17.0",
|
|
57
57
|
"@fluidframework/eslint-config-fluid": "^2.0.0",
|
|
58
58
|
"@fluidframework/merge-tree-previous": "npm:@fluidframework/merge-tree@2.0.0-internal.4.1.0",
|
|
59
|
-
"@fluidframework/mocha-test-setup": "2.0.0-dev.4.
|
|
60
|
-
"@fluidframework/test-runtime-utils": "2.0.0-dev.4.
|
|
59
|
+
"@fluidframework/mocha-test-setup": "2.0.0-dev.4.4.0.161322",
|
|
60
|
+
"@fluidframework/test-runtime-utils": "2.0.0-dev.4.4.0.161322",
|
|
61
61
|
"@microsoft/api-extractor": "^7.34.4",
|
|
62
62
|
"@types/diff": "^3.5.1",
|
|
63
63
|
"@types/mocha": "^9.1.1",
|
package/.vscode/launch.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
// Use IntelliSense to learn about possible attributes.
|
|
3
|
-
// Hover to view descriptions of existing attributes.
|
|
4
|
-
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
|
5
|
-
"version": "0.2.0",
|
|
6
|
-
"configurations": [
|
|
7
|
-
{
|
|
8
|
-
"name": "Merge Tree Test",
|
|
9
|
-
"type": "node",
|
|
10
|
-
"request": "launch",
|
|
11
|
-
"program": "${workspaceRoot}/../../../node_modules/mocha/bin/_mocha",
|
|
12
|
-
"args": ["dist/test", " --recursive", "--no-timeouts", "--exit"],
|
|
13
|
-
"cwd": "${workspaceRoot}"
|
|
14
|
-
}
|
|
15
|
-
]
|
|
16
|
-
}
|
package/docs/Attribution.md
DELETED
|
@@ -1,382 +0,0 @@
|
|
|
1
|
-
# Attribution
|
|
2
|
-
|
|
3
|
-
This design document covers a high-level plan for embedding attribution information into merge-tree.
|
|
4
|
-
It attempts to be detailed enough to start fleshing out proposed optimizations into code, though the actual factoring of the code
|
|
5
|
-
(responsibilities of objects, names/semantics) may be subject to change in further refinements of the design.
|
|
6
|
-
|
|
7
|
-
## Motivation
|
|
8
|
-
|
|
9
|
-
A common feature in collaborative applications is the ability to attribute pieces of content to a particular user.
|
|
10
|
-
This attribution information generally contains information about who edited the content as well as when the edit occurred.
|
|
11
|
-
|
|
12
|
-
At the time of writing this document, the Fluid Framework doesn't natively support this kind of functionality,
|
|
13
|
-
though in theory it has all of the data it needs (the op envelope contains both a timestamp and a client id, which can
|
|
14
|
-
be mapped to information about the user using the audience).
|
|
15
|
-
This has forced Fluid consumers that want attribution information to use workaround schemes. For example, in SharedString it's
|
|
16
|
-
straightforward to conceptualize a scheme where each time a client submits an op that edits the string, it waits for that op to
|
|
17
|
-
ack and uses the timestamp on that op to submit an additional op that annotates the edited segments with attribution information.
|
|
18
|
-
|
|
19
|
-
Besides unnecessarily complicating client code, this has several drawbacks:
|
|
20
|
-
|
|
21
|
-
- It is noisy on the wire
|
|
22
|
-
- Attribution information can be lost in various cases if the submitting client disconnects
|
|
23
|
-
- In-memory and snapshot size for the SharedString is more bloated than it should be; without binning the timestamps this strategy
|
|
24
|
-
entirely invalidated the zamboni scheme, and even if the timestamps are binned this will unnecessarily include the same user info
|
|
25
|
-
many times on different segments
|
|
26
|
-
|
|
27
|
-
Rather than force this burden on consumers, it makes more sense to bake some attribution capability into the Fluid Framework in an opt-in way.
|
|
28
|
-
Though this document will cover an approach for doing so in merge-tree (primarily targeted at support for attribution in SharedString),
|
|
29
|
-
none of the above concerns are specific to a single DDS.
|
|
30
|
-
It's imaginable that Fluid will eventually want to generalize this to a platform mechanism that's supported by each DDS that wants to opt in to it.
|
|
31
|
-
For that reason, the design is aimed to modularize into areas that are generic to the container runtime and those that are DDS-specific.
|
|
32
|
-
|
|
33
|
-
## High-level
|
|
34
|
-
|
|
35
|
-
If one had access to the entire op stream, a lookup from all historical client ids to their user info,
|
|
36
|
-
and every DDS retained information about which sequence number created/modified each part of its data,
|
|
37
|
-
attribution would be straightforward. Ask the DDS for the relevant sequence number, then look at this sequence number's op for a timestamp + clientId
|
|
38
|
-
and use the client id to look up user information.
|
|
39
|
-
|
|
40
|
-
All of this information is knowable from the Fluid runtime perspective, though not all of it is persisted indefinitely.
|
|
41
|
-
Notably:
|
|
42
|
-
|
|
43
|
-
- Access to the entire op stream is an unreasonable assumption due to the summarization process
|
|
44
|
-
- User information is only accessible for connected clients
|
|
45
|
-
|
|
46
|
-
However, this conceptualization of attribution does suggest a reasonable split of concerns that can be individually assessed:
|
|
47
|
-
none of the association between sequence numbers, timestamps, clientIds, and user information is specific to any given DDS.
|
|
48
|
-
Thus, all of this bookkeeping could be generically done by the framework (potential candidates include on container runtime, data store runtime, or channel context),
|
|
49
|
-
and any query-style APIs a DDS might support for retrieving attribution information could be accomplished by asking the runtime for information about a given sequence number.
|
|
50
|
-
|
|
51
|
-
This leaves two high-level problems:
|
|
52
|
-
|
|
53
|
-
1. How can the framework manage to associate sequence numbers to attribution information efficiently?
|
|
54
|
-
2. What degrees of freedom should merge-tree expose for attributing its state to different users?
|
|
55
|
-
|
|
56
|
-
## Sequence Number to Attribution Association
|
|
57
|
-
|
|
58
|
-
Setting aside the problem of where to put the state for now, there are two primary ways by which associations between sequence numbers and attribution
|
|
59
|
-
can be made practical from a memory perspective.
|
|
60
|
-
First, there needs to be a garbage collection scheme to clean up attribution information on removed content.
|
|
61
|
-
|
|
62
|
-
Secondly, attribution information needs to be compacted to an efficient format, both in terms of snapshot size (i.e. plain data representation) and desired
|
|
63
|
-
level of granularity (applications don't care about millisecond-accurate timestamps).
|
|
64
|
-
|
|
65
|
-
Finally, the in-memory data structures that support the necessary APIs are discussed.
|
|
66
|
-
|
|
67
|
-
### Cleanup of outdated information
|
|
68
|
-
|
|
69
|
-
Since the semantics of each op is opaque to the runtime, the runtime needs some mechanism to ascertain when an op's attribution is no longer relevant, i.e.
|
|
70
|
-
not referenced.
|
|
71
|
-
|
|
72
|
-
There are a few general models that could work:
|
|
73
|
-
|
|
74
|
-
1. Assume that the runtime controls authoring of references to attribution information. It could stamp such information with a unique symbol such that it could
|
|
75
|
-
later be recognized in serialization to determine if the info was still referenced.
|
|
76
|
-
This approach is not far off from how `IFluidHandle`s work.
|
|
77
|
-
Reference counting the created objects could also work, but would likely be messier (responsibility of cleanup will likely end up extending past where we want it).
|
|
78
|
-
2. Demand objects that store attribution information implement a function that exposes all sequence numbers they reference.
|
|
79
|
-
|
|
80
|
-
Option 1 might look something like
|
|
81
|
-
|
|
82
|
-
```typescript
|
|
83
|
-
const attributionHandle = Symbol("attribution handle");
|
|
84
|
-
|
|
85
|
-
class /*Container/DataStore/etc. (TBD)*/ Runtime {
|
|
86
|
-
public createAttributionHandle(sequenceNumber: number) {
|
|
87
|
-
return {
|
|
88
|
-
[attributionHandle]: true,
|
|
89
|
-
sequenceNumber,
|
|
90
|
-
};
|
|
91
|
-
}
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
// Serialization logic in ISerializer would need to look for attributionHandle symbol usages
|
|
95
|
-
// and serialize it appropriately.
|
|
96
|
-
// Similarly for deserialization.
|
|
97
|
-
// At summary time, the set of sequence numbers that were referenced can be recorded for each data store,
|
|
98
|
-
// and any sequence numbers that are no longer referenced could have their attribution information cleaned up.
|
|
99
|
-
// Incremental summaries make the bookkeeping of this scheme slightly more complicated, but the general idea
|
|
100
|
-
// still works.
|
|
101
|
-
```
|
|
102
|
-
|
|
103
|
-
The main advantage of option 1 is that it requires less DDS/application code.
|
|
104
|
-
However, it causes larger-sized snapshots, since the serialized form of runtime-minted attribution handles will be more verbose than a simple number.
|
|
105
|
-
It also leads to a potentially nasty bug pit: there's not a practical way to enforce that objects storing attribution information actually call
|
|
106
|
-
`createAttributionHandle` before serializing their data: they could just as easily store the sequence number and only call `createAttributionHandle`
|
|
107
|
-
directly before trying to obtain attribution information.
|
|
108
|
-
This would risk attribution information getting GC'd too early.
|
|
109
|
-
|
|
110
|
-
Option 2 would look closer to this:
|
|
111
|
-
|
|
112
|
-
```typescript
|
|
113
|
-
interface IReferenceAttributionInfo {
|
|
114
|
-
/**
|
|
115
|
-
* @returns an iterable over all sequence numbers for which this object references attribution information.
|
|
116
|
-
*/
|
|
117
|
-
getReferencedSeqs(): Iterable<number>;
|
|
118
|
-
}
|
|
119
|
-
|
|
120
|
-
class MergeTree implements IReferenceAttributionInfo {
|
|
121
|
-
public getReferencedSeqs() {
|
|
122
|
-
const seqs = new Set();
|
|
123
|
-
this.walkAllSegments(this.root, (seg) => {
|
|
124
|
-
seqs.add(seg.seq);
|
|
125
|
-
});
|
|
126
|
-
return seqs;
|
|
127
|
-
}
|
|
128
|
-
}
|
|
129
|
-
```
|
|
130
|
-
|
|
131
|
-
Though this design forces extra code on users, it's typically not conceptually difficult to implement and enables the serialized format to be more compact.
|
|
132
|
-
|
|
133
|
-
It's worth noting that both of these models have interesting interactions with partial checkouts / schemes for more incremental summarization at the DDS level (via blob re-use [see #832](https://dev.azure.com/fluidframework/internal/_workitems/edit/832)): each would need to support some notion of reference count deltas from the previous result.
|
|
134
|
-
|
|
135
|
-
### Compaction of similar information
|
|
136
|
-
|
|
137
|
-
One primary motivation for supporting attribution information natively in the framework is the potential to reduce redundant attribution information in snapshots.
|
|
138
|
-
There are two obvious ways data is redundant: user information gets repeatedly inlined into `JSON.stringify`d content, and various sets of ops all likely have
|
|
139
|
-
virtually the same attribution information (same user, perhaps a slightly different timestamp).
|
|
140
|
-
Ops that have closer together sequence number are more likely to contain such redundant information, as users tend to edit documents in bursts.
|
|
141
|
-
This suggests a few strategies for keeping a compact format (either only on serialization or in-memory as well):
|
|
142
|
-
|
|
143
|
-
1. Intern user objects
|
|
144
|
-
2. Intern attribution objects
|
|
145
|
-
3. If a range of sequence numbers all have the same attribution information, store it as such
|
|
146
|
-
4. Allow "equivalent timestamp" policy injection: it's unlikely any app needs millisecond or better accuracy on the server ack timestamp for attribution purposes.
|
|
147
|
-
There should be a configurable policy for how timestamps get binned. Basic implementations could bin on a fixed cadence, but for even more compact files a dynamic bin size policy with larger bins for less recent data could also give a reasonable user experience
|
|
148
|
-
|
|
149
|
-
Optimizations 1 through 3 are all things that standard compression algorithms can detect: interning objects is essentially
|
|
150
|
-
[dictionary compression](https://en.wikipedia.org/wiki/Dictionary_coder) and compressing adjacent ranges is
|
|
151
|
-
[run-length encoding](https://en.wikipedia.org/wiki/Run-length_encoding), so before going through the trouble of writing bespoke compression code
|
|
152
|
-
we should experiment with things like [LZ4](<https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)>) and [DEFLATE](https://en.wikipedia.org/wiki/Deflate).
|
|
153
|
-
For the purposes of illustration, the following sections will outline how the bespoke code might look.
|
|
154
|
-
|
|
155
|
-
#### Interning
|
|
156
|
-
|
|
157
|
-
Rather than repeatedly serialize the same information in the snapshot format, we can internally add a level of indirection to the `user` field, the entire
|
|
158
|
-
attribution object, or both.
|
|
159
|
-
This optimization would be entirely transparent to the public API: whatever snapshot/in-memory format we use, we'd always convert to `AttributionInfo` before
|
|
160
|
-
returning the information for a given seq to the DDS/application.
|
|
161
|
-
|
|
162
|
-
Interfaces might look like this, with exported properties being those visible to an application:
|
|
163
|
-
|
|
164
|
-
```typescript
|
|
165
|
-
export interface AttributionInfo {
|
|
166
|
-
user: IUser;
|
|
167
|
-
timestamp: number;
|
|
168
|
-
}
|
|
169
|
-
|
|
170
|
-
export interface IAttributor {
|
|
171
|
-
getAttributionInfo(seq: number): AttributionInfo;
|
|
172
|
-
}
|
|
173
|
-
|
|
174
|
-
type InternedRef = number & { readonly InternedRef: "e86840d8-8384-450c-b0e3-9a2855ba2d21" };
|
|
175
|
-
|
|
176
|
-
interface ObjectInterner {
|
|
177
|
-
getOrCreateRef(obj: Jsonable): InternedRef;
|
|
178
|
-
getObject(id: InternedRef): Jsonable;
|
|
179
|
-
getSerializable(): Jsonable;
|
|
180
|
-
}
|
|
181
|
-
|
|
182
|
-
interface CompactAttributionInfo {
|
|
183
|
-
userRef: InternedRef;
|
|
184
|
-
timestamp: number;
|
|
185
|
-
}
|
|
186
|
-
|
|
187
|
-
// Concrete types for a particular `Attributor` implementation
|
|
188
|
-
interface SerializedAttributor {
|
|
189
|
-
interner: Jsonable /* result of calling getSerializable() on an ObjectInterner */;
|
|
190
|
-
lookup: {
|
|
191
|
-
[seq: number]: InternedRef /* to CompactAttributionInfo */ | CompactAttributionInfo;
|
|
192
|
-
};
|
|
193
|
-
}
|
|
194
|
-
```
|
|
195
|
-
|
|
196
|
-
#### Adjacent Range Coalescing
|
|
197
|
-
|
|
198
|
-
Typical documents will likely have a number of consecutive ops with the same attribution information.
|
|
199
|
-
This happens for a few reasons: users might make a number of edits in a short period of time (consider a user typing
|
|
200
|
-
out a new paragraph), and ops submitted by a single container are batched under some circumstances.
|
|
201
|
-
|
|
202
|
-
Rather than end up with a `SerializedAttributor` that resembles this:
|
|
203
|
-
|
|
204
|
-
```javascript
|
|
205
|
-
{
|
|
206
|
-
interner: [{ email: "john.doe@contoso.com", id: "f400ddf3-4d04-48e9-8783-4b1db8a45fc3" }, { user: 0, timestamp: 1661974200000 }],
|
|
207
|
-
lookup: {
|
|
208
|
-
50: 1,
|
|
209
|
-
51: 1,
|
|
210
|
-
52: 1,
|
|
211
|
-
53: 1,
|
|
212
|
-
54: 1
|
|
213
|
-
}
|
|
214
|
-
}
|
|
215
|
-
```
|
|
216
|
-
|
|
217
|
-
we could instead serialize the lookup table like so:
|
|
218
|
-
|
|
219
|
-
```javascript
|
|
220
|
-
{
|
|
221
|
-
interner: [{ email: "john.doe@contoso.com", id: "f400ddf3-4d04-48e9-8783-4b1db8a45fc3" }, { user: 0, timestamp: 1661974200000 }],
|
|
222
|
-
lookup: [{ key: [50, 54], value: 1 }]
|
|
223
|
-
}
|
|
224
|
-
```
|
|
225
|
-
|
|
226
|
-
Since objects are distinguishable from numbers, single-number ranges could just have a number key.
|
|
227
|
-
|
|
228
|
-
```typescript
|
|
229
|
-
interface AttributionEntry {
|
|
230
|
-
/**
|
|
231
|
-
* Either a single `seq` number for this attribution entry, or a consecutive range `[start, end]` (inclusive)
|
|
232
|
-
* of `seq` numbers which all have the same attribution information.
|
|
233
|
-
*/
|
|
234
|
-
k: number | [number, number];
|
|
235
|
-
v: InternedRef | CompactAttributionInfo;
|
|
236
|
-
}
|
|
237
|
-
|
|
238
|
-
// Concrete types for a particular `Attributor` implementation
|
|
239
|
-
interface SerializedAttributor {
|
|
240
|
-
interner: Jsonable /* result of calling getSerializable() on an ObjectInterner */;
|
|
241
|
-
lookup: AttributionEntry[];
|
|
242
|
-
}
|
|
243
|
-
```
|
|
244
|
-
|
|
245
|
-
#### Timestamp Binning
|
|
246
|
-
|
|
247
|
-
One key aspect of information compaction is the ability to bin the precise timestamps given by the server into more reasonable granularity levels for attribution.
|
|
248
|
-
Unlike the other optimizations to compact information, binning is lossy.
|
|
249
|
-
Binning can simply be dictated by a function that takes in a timestamp and returns a timestamp for the output bin.
|
|
250
|
-
Simple strategies like "bin every 5 minutes" are as simple as `(timestamp: number) => timestamp - (timestamp % (1000 * 60 * 5))`,
|
|
251
|
-
but allowing an arbitrary function here also empowers more advanced users to make partitions of timespace like "5-minute granularity up to a day ago, 1-day granularity up to a month ago, 1-month granularity up to a year ago, yearly granularity otherwise".
|
|
252
|
-
For the simple strategy, running the binning function on initial sequencing of the op would be sufficient.
|
|
253
|
-
To make the second function behave as desired ("old attribution information tends to get coalesced"),
|
|
254
|
-
the runtime would also have to re-bin existing attribution information either every so often or just on document load.
|
|
255
|
-
|
|
256
|
-
We should apply this optimization last, and only if we need it. It's possible standard time-series compression of numbers will be sufficient here.
|
|
257
|
-
|
|
258
|
-
### In-memory attribution structure
|
|
259
|
-
|
|
260
|
-
Attributor bookkeeping needs to efficiently support:
|
|
261
|
-
|
|
262
|
-
- Lookup of attribution information at a `seq`
|
|
263
|
-
- Adding attribution information for a newly sequenced op
|
|
264
|
-
- Merging consecutive attribution entries that should now be coalesced (depending on other design choices, this one is less important)
|
|
265
|
-
|
|
266
|
-
One candidate implementation would be to expand the serialized format entirely and use a `Map`. This implementation is viable, but uses
|
|
267
|
-
`O(attributed seq#s)` memory. It would provide `O(1)` lookup.
|
|
268
|
-
Another reasonable candidate would be to keep the overall structure of having coalesced adjacent ranges, putting the serialized form into a
|
|
269
|
-
sorted list that can be binary searched.
|
|
270
|
-
This would give a reasonable memory win at the cost of increasing lookup time to `O(log(attributed seq#s))`.
|
|
271
|
-
|
|
272
|
-
Putting all of the optimizations together, an `Attributor` implementation might look something like this:
|
|
273
|
-
|
|
274
|
-
```typescript
|
|
275
|
-
export interface IAttributor {
|
|
276
|
-
getAttributionInfo(seq: number): AttributionInfo;
|
|
277
|
-
}
|
|
278
|
-
|
|
279
|
-
export const binByMinutes = (interval: number) => (timestamp: number) => timestamp - (timestamp % (1000 * 60 * interval));
|
|
280
|
-
|
|
281
|
-
const seqComparator = (a: AttributionEntry, b: AttributionEntry) => {
|
|
282
|
-
aEnd = typeof a.k === 'number' ? a.k : a.k[1];
|
|
283
|
-
bEnd = typeof b.k === 'number' ? b.k : b.k[1];
|
|
284
|
-
return aEnd - bEnd;
|
|
285
|
-
}
|
|
286
|
-
|
|
287
|
-
class Attributor implements IAttributor {
|
|
288
|
-
private seqToInfo: SortedList<AttributionEntry> = new SortedList(seqComparator);
|
|
289
|
-
constructor(
|
|
290
|
-
runtime: IFluidDataStoreRuntime,
|
|
291
|
-
serialized?: SerializedAttributor,
|
|
292
|
-
bin: (timestamp: number) => number = binByMinutes(5)
|
|
293
|
-
) {
|
|
294
|
-
if (serialized) {
|
|
295
|
-
const interner = new ObjectInterner(serialized.interner);
|
|
296
|
-
// Note: this implementation doesn't coalesce re-binned attribution entries that are newly equivalent and adjacent.
|
|
297
|
-
this.seqToInfo.extend(...serialized.lookup.map(({ k, v: maybeInternedV }) => {
|
|
298
|
-
const { timestamp, userRef } = isInternedRef(maybeInternedV) ? interner.getObject(maybeInternedV) : maybeInternedV;
|
|
299
|
-
const v = {
|
|
300
|
-
timestamp: bin(timestamp),
|
|
301
|
-
user: interner.getObject(userRef)
|
|
302
|
-
};
|
|
303
|
-
return { k, v };
|
|
304
|
-
}));
|
|
305
|
-
}
|
|
306
|
-
|
|
307
|
-
const { deltaManager, audience } = runtime;
|
|
308
|
-
deltaManager.on("op", (message: ISequencedDocumentMessage) => {
|
|
309
|
-
const attributionInfo = {
|
|
310
|
-
/* note: for object interning to work, this needs to be a referentially equal user object. If that isn't provided by the
|
|
311
|
-
Fluid Framework, we probably would want a layer of caching here. For interning of overall attribution info objects,
|
|
312
|
-
we may want a similar cache. */
|
|
313
|
-
user: audience.get(message.clientId).user,
|
|
314
|
-
timestamp: bin(message.timestamp)
|
|
315
|
-
};
|
|
316
|
-
const { k, v } = seqToInfo.getAt(seqToInfo.length - 1);
|
|
317
|
-
const lastEntryStart = typeof k === 'number' ? k : k[0];
|
|
318
|
-
const lastEntryEnd = typeof k === 'number' ? k : k[1];
|
|
319
|
-
if (
|
|
320
|
-
attributionInfosAreEquivalent(attributionInfo, v) &&
|
|
321
|
-
// Note: this coalescing logic is somewhat unideal since no-ops break it.
|
|
322
|
-
message.seq === 1 + lastEntryEnd
|
|
323
|
-
) {
|
|
324
|
-
this.seqToInfo.pop();
|
|
325
|
-
this.seqToInfo.insert({ k: [lastEntryStart, message.seq], v });
|
|
326
|
-
} else {
|
|
327
|
-
this.seqToInfo.insert({ k: message.seq, v: attributionInfo });
|
|
328
|
-
}
|
|
329
|
-
});
|
|
330
|
-
}
|
|
331
|
-
|
|
332
|
-
public getAttributionInfo(seq: number): AttributionInfo {
|
|
333
|
-
const { k, v } = seqToInfo.findAtOrAfter(seq);
|
|
334
|
-
assert(k === seq || (k.length === 2 && k[0] <= seq && seq <= k[1]));
|
|
335
|
-
return v;
|
|
336
|
-
}
|
|
337
|
-
|
|
338
|
-
// Unpictured:
|
|
339
|
-
// - serialization (not interesting; deserialization logic is pictured)
|
|
340
|
-
// - GC (there are several ways to hook this up, though one can check the data structure should support it in O(n))
|
|
341
|
-
}
|
|
342
|
-
|
|
343
|
-
```
|
|
344
|
-
|
|
345
|
-
### Bookkeeping Placement Considerations
|
|
346
|
-
|
|
347
|
-
There are several levels that the framework could choose to conceptually store "sequence number to attribution" information:
|
|
348
|
-
|
|
349
|
-
- Container Runtime
|
|
350
|
-
- Data store runtime
|
|
351
|
-
- DDS
|
|
352
|
-
|
|
353
|
-
The initial attributor implementation will likely be hooked up to only `SharedString` due to current feature asks of partner teams.
|
|
354
|
-
However, it's worth calling out that depending on which layer the runtime places the information, there are consequences with respect
|
|
355
|
-
to GC and how well information compacts.
|
|
356
|
-
|
|
357
|
-
For GC:
|
|
358
|
-
|
|
359
|
-
- Determining whether sequence numbers are referenced by any attribution information gets complicated slightly by incremental summarization if
|
|
360
|
-
information is stored on container runtime
|
|
361
|
-
|
|
362
|
-
For compaction:
|
|
363
|
-
|
|
364
|
-
- Compaction schemes potentially get worse for sequences of ops that alter different data stores if attribution information is stored at a
|
|
365
|
-
fine-grained level (e.g. DDS, Data store runtime).
|
|
366
|
-
|
|
367
|
-
## Merge-Tree Attribution API
|
|
368
|
-
|
|
369
|
-
TODO: This section will cover planned extension points for specifying attribution information on merge-tree.
|
|
370
|
-
|
|
371
|
-
My current thinking is something along the lines of the following:
|
|
372
|
-
|
|
373
|
-
Segments have a `attribution` field which is an opaque object to merge-tree, but splits/combines/impacts merge behavior a la tracking groups.
|
|
374
|
-
Users of merge-tree are empowered to inject policy into the `attribution` of the segment as they see fit.
|
|
375
|
-
The most basic policy which we should get for free would be to use `clientSeq` as the only tracked attribution state, which corresponds to
|
|
376
|
-
an application that only wants to track who inserted the segment and when they did it.
|
|
377
|
-
|
|
378
|
-
More advanced users could provide fancier json-serializable state objects such as `{ inserted: number, annotated: number }` and set up proper
|
|
379
|
-
semantics for those fields.
|
|
380
|
-
|
|
381
|
-
I need to think through if current merge-tree delta operation events are a sufficient entrypoint for managing such state, or if there's a nicer
|
|
382
|
-
way to encapsulate common desires.
|
package/docs/DEV.md
DELETED
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
# Developer Notes
|
|
2
|
-
|
|
3
|
-
## Merge Tree
|
|
4
|
-
|
|
5
|
-
### Node lengths
|
|
6
|
-
|
|
7
|
-
If a function reports a node's length as undefined, it means the node has been removed from the perspective of the client and/or reference sequence number.
|
|
8
|
-
Alternately, if a function reports a node's length as 0 it means that node is not yet visible from the perspective of the client and/or reference sequence number.
|
|
9
|
-
This distinction is important, as a removed segment with undefined length may not exist on remote clients, as it could have already been zambonied.
|
|
10
|
-
However a not yet visible segment with 0 length may already exist, or will eventually exist, on all clients.
|
|
11
|
-
These have implications for eventually consistent conflict resolution. Generally, we ignore removed segments, and special case invisible segments, like in the case
|
|
12
|
-
of conflicting insert as handled in the `breakTie` function
|
|
13
|
-
|
|
14
|
-
### Zamboni
|
|
15
|
-
|
|
16
|
-
Zamboni is the garbage collection process in the merge tree. As segments change due to inserts and deletes, we add them to a heap which keeps the segment with the lowest sequence number at the head. These segments drive the zamboni process which is also run on every change. The zamboni process peeks at the heap to determine if the head is below the min sequence; if so, the segment is eligible. The minimum sequence number is important here, as the minimum sequence number is a sequence seen by all clients, and all clients will specify their reference sequence number as above the minimum sequence number. This means that no new operations can come in that reference anything at or below the minimum sequence number, so we are safe to clean up anything we would need when applying incoming operations. Eligible segments are collected, and then a few different operations are done, specifically: merge, remove, and tree rebalance. Zamboni is incremental, and only collects a constant number of segments at each change so as not to introduce performance issues.
|
|
17
|
-
|
|
18
|
-
Merge is done if two adjacent segments are of the same type like text, that type is mergable (markers are not), neither are deleted, and all the properties match. The merge process reduces the number of segments, which are leaf nodes of the merge tree. For instance a user may type `c`, `a`, and `t` with each character being its own operation and therefore segment. The user could then highlight that range, and set a property on all the characters indicating that they are bold, `{bold: true}`. At some later point, these segments would move to the top of the heap, and their sequence numbers would move below the minimum sequence number. At that point zamboni could take those individual segments, and merge them into a single segment, `cat` with the property `{bold: true}`
|
|
19
|
-
|
|
20
|
-
Remove is a bit simpler. On removal of a segment, we track its removed sequence number. When the segment's removed sequence number drops below the minimum sequence number it can be safely removed from the tree.
|
|
21
|
-
|
|
22
|
-
Rebalance is a bit different from merge and remove, as it has to do with maintaining the tree itself. After merge or removal there are fewer segments, aka leaf nodes, in the tree. This allows us to more efficiently pack the non-leaf nodes of the tree, and potentially remove layers from the tree. This keeps the tree compact, which has both memory and CPU performance implications.
|
package/docs/Obliterate.md
DELETED
|
@@ -1,647 +0,0 @@
|
|
|
1
|
-
# Merge Tree Obliterate
|
|
2
|
-
|
|
3
|
-
This document covers motivation, spec, and design for the upcoming "obliterate" feature of merge-tree.
|
|
4
|
-
|
|
5
|
-
## Spec
|
|
6
|
-
|
|
7
|
-
A concise description of merge-tree's current merge conflict resolution strategy is as follows:
|
|
8
|
-
|
|
9
|
-
- Insertion of a text segment only conflicts with other insertions at the same location.
|
|
10
|
-
The conflict is resolved by inserting the segment added later nearer in the string.
|
|
11
|
-
For example, from an initial state of "abc", if the operations [insert "hi " at 0] from client 1
|
|
12
|
-
and [insert "bye " at 0] from client 2 are sequenced in that order, the resulting state is "bye hi abc".
|
|
13
|
-
- Range operations (delete, annotate) apply to the range at the time the operation was issued.
|
|
14
|
-
Specifically, insertion of a segment into a range that is concurrently deleted or annotated
|
|
15
|
-
will not result in that inserted segment being deleted or annotated. For example, from an initial state "012",
|
|
16
|
-
the operations [delete the range [1, 3)] from client 1 and [insert "hi" at index 2 (i.e. between "1" and "2")] from client 2,
|
|
17
|
-
the resulting text is "0hi".
|
|
18
|
-
|
|
19
|
-
The merge outcomes for ranges are easy to understand, but not always desirable.
|
|
20
|
-
Oftentimes, when consumers want to work with ranges, they may want their operation to apply to concurrently inserted segments.
|
|
21
|
-
In the example above, these semantics would look like so:
|
|
22
|
-
|
|
23
|
-
```
|
|
24
|
-
// Initial state at seq 0: "012"
|
|
25
|
-
{ seq: 1, refSeq: 0, clientId: 1, op: <insert "hi" at index 2> }
|
|
26
|
-
{ seq: 2, refSeq: 0, clientId: 2, op: <delete the range [1, 3)> }
|
|
27
|
-
// final desired state: "0"
|
|
28
|
-
```
|
|
29
|
-
|
|
30
|
-
```
|
|
31
|
-
// Initial state at seq 0: "012"
|
|
32
|
-
{ seq: 1, refSeq: 0, clientId: 2, op: <delete the range [1, 3)> }
|
|
33
|
-
{ seq: 2, refSeq: 0, clientId: 1, op: <insert "hi" at index 2> }
|
|
34
|
-
// final desired state: "0"
|
|
35
|
-
```
|
|
36
|
-
|
|
37
|
-
A `SharedString` feature request for a removal operation with these semantics dubbed them "obliterate".
|
|
38
|
-
|
|
39
|
-
At an implementation level, these semantics can be viewed in two parts:
|
|
40
|
-
|
|
41
|
-
- The range specification is resolved at the time the op is sequenced
|
|
42
|
-
- Any subsequent segments inserted into that range concurrently should also be removed
|
|
43
|
-
|
|
44
|
-
The first clause handles concurrent inserts before the removal is sequenced, and the second clause handles concurrent inserts after the removal is sequenced.
|
|
45
|
-
|
|
46
|
-
However, there is a way to view obliterate's semantics as a special case of a "move" operation,
|
|
47
|
-
which preserves content identity such that concurrently inserted segments will be inserted to the range at its current location.
|
|
48
|
-
A main motivator here from the app perspective might be the idea that if user 1 cut and pastes an entire paragraph to a different section of the document
|
|
49
|
-
whiler user 2 edits it, the desired merge outcome would likely be for user 2's edit to apply to the paragraph in its new location.
|
|
50
|
-
Roughly, anywhere an application would want obliterate merge semantics on user delete of some content,
|
|
51
|
-
the same application would want move semantics if the user instead cut and pasted the content somewhere else.
|
|
52
|
-
|
|
53
|
-
There have historically been feature requests for move semantics inside merge-tree (for example [issue 8518](https://github.com/microsoft/FluidFramework/issues/8518)),
|
|
54
|
-
so it makes sense to do forward-thinking on implementing obliterate in a way that we can extend it to cover move semantics in the future.
|
|
55
|
-
|
|
56
|
-
For that reason, naming choices of fields and semantics for the remainder of the document will be written in terms of obliterate being the special case
|
|
57
|
-
"move this range out of existence".
|
|
58
|
-
This should alleviate any back-compat issues if/when we do decide to implement move (esp. fields that end up in ops or snapshots).
|
|
59
|
-
The current proposal is to use the runtime value "null" to represent "out of existence", but this choice is flexible.
|
|
60
|
-
In prose, for terseness that operation will still be called obliterate.
|
|
61
|
-
After describing obliterate's design, this document [digs into how the design can be extended to work for move](##Move).
|
|
62
|
-
|
|
63
|
-
Notice that the above examples always insert text at positions strictly inside the removed range.
|
|
64
|
-
If the insert operation was instead before the "1" or after the "2", one can imagine different applications wanting different behavior:
|
|
65
|
-
either the obliterated region should expand to include that text, or it should not.
|
|
66
|
-
This topic will be covered in the [endpoint behavior](#endpoint-behavior) section,
|
|
67
|
-
but for eventual consistency strategy discussion one should assume that the design should generally support both options
|
|
68
|
-
(and either leave it up to merge-tree to restrict degrees of freedom as it seems fit).
|
|
69
|
-
|
|
70
|
-
## Eventual Consistency Strategy
|
|
71
|
-
|
|
72
|
-
This section is focused on how one could implement the "obliterate" semantics inside merge tree in an eventually consistent fashion.
|
|
73
|
-
This will constitute the bulk of the complexity of the feature.
|
|
74
|
-
Since obliterate is generally a "different kind of remove," there may be a nice abstraction to introduce at the code level to generalize
|
|
75
|
-
removal information. However, in favor of introducing niceties later this design document will assume fields are inlined and focus on
|
|
76
|
-
the strategy for ensuring eventual consistency. If such an abstraction is introduced, ideally it would enable better "pay-to-play" of
|
|
77
|
-
common code paths based on merge-tree feature usage.
|
|
78
|
-
As an example, `BaseSegment.split()` needs to copy segment properties to the split segment.
|
|
79
|
-
So new properties added to segment will unnecessarily copy undefined values.
|
|
80
|
-
|
|
81
|
-
There are a few aspects of merge tree's bookkeeping and general feature set that require consideration when designing new op semantics:
|
|
82
|
-
|
|
83
|
-
- Any changes to direct fields of tree nodes themselves (either new data or changes to bookkeeping of existing data)
|
|
84
|
-
- How the feature interacts with an increasing collab window and zamboni
|
|
85
|
-
- Impact on the partial lengths scheme
|
|
86
|
-
- Bookkeeping and handling of overlapping removals (note some may be obliterates and some may not be)
|
|
87
|
-
- Reconnection
|
|
88
|
-
- Snapshotting impact
|
|
89
|
-
|
|
90
|
-
We'll first present an overview of a potential scheme for implementing the obliterate op, then comment on these aspects.
|
|
91
|
-
|
|
92
|
-
### High-level bookkeeping changes
|
|
93
|
-
|
|
94
|
-
Segments will be augmented with `movedSeq` and `localMovedSeq` fields which generally align with the semantics of `seq, localSeq, removedSeq,` and `localRemovedSeq`.
|
|
95
|
-
When segments are moved and not just obliterated, they will also contain a reference to the destination segment.
|
|
96
|
-
This may look as follows:
|
|
97
|
-
|
|
98
|
-
```typescript
|
|
99
|
-
/**
|
|
100
|
-
* Tracks information about when and where this segment was moved to.
|
|
101
|
-
* @example - Suppose a merge tree had 3 TextSegments "X", "A", and "B", and
|
|
102
|
-
* received the operation `move({ start: 0, end: 1 }, { dest: 3 }, { seq: 30 })` (moving the "X"
|
|
103
|
-
* after the "A" and the "B").
|
|
104
|
-
* After processing this operation, it would have the segments `[<moved "X" tombstone>, "A", "B", "X"]`.
|
|
105
|
-
* The moved "X" tombstone segment would have the following IMoveInfo: `{ movedSeq: 30, moveDst: <reference to living "X" segment>}`
|
|
106
|
-
*/
|
|
107
|
-
export interface IMoveInfo {
|
|
108
|
-
/**
|
|
109
|
-
* Local seq at which this segment was moved if the move is yet-to-be acked. Only set on the tombstone "source" segment of the move.
|
|
110
|
-
*/
|
|
111
|
-
localMovedSeq?: number;
|
|
112
|
-
/**
|
|
113
|
-
* Seq at which this segment was moved. Only set on the tombstone "source" segment of the move.
|
|
114
|
-
*/
|
|
115
|
-
movedSeq: number;
|
|
116
|
-
/**
|
|
117
|
-
* A reference to the inserted destination segment corresponding to this segment's move.
|
|
118
|
-
* If undefined, the move was an obliterate.
|
|
119
|
-
*/
|
|
120
|
-
moveDst?: ReferencePosition;
|
|
121
|
-
|
|
122
|
-
/**
|
|
123
|
-
* List of client IDs that have moved this segment.
|
|
124
|
-
* The client that actually moved the segment (i.e. whose move op was sequenced first) is stored as the first
|
|
125
|
-
* client in this list. Other clients in the list have all issued concurrent ops to move the segment.
|
|
126
|
-
*/
|
|
127
|
-
movedClientIds: number[];
|
|
128
|
-
}
|
|
129
|
-
|
|
130
|
-
export interface ISegment extends Partial<IRemovalInfo>, Partial<IMoveInfo> {
|
|
131
|
-
// ...
|
|
132
|
-
}
|
|
133
|
-
```
|
|
134
|
-
|
|
135
|
-
The `moveDst` reference position functions as a redirection pointer when another client attempts to concurrently insert into the moved range: the usual approach
|
|
136
|
-
for locating a node at some `{ pos, refSeq, clientId }` applies, and if the resulting segment has been moved, one can follow the trail of moves to find the segment's
|
|
137
|
-
current location.
|
|
138
|
-
|
|
139
|
-
Note that though `movedSeq` and `localMovedSeq` act very similarly to `removedSeq` and `localRemovedSeq` when considering the length of a segment at a given
|
|
140
|
-
perspective: if the perspective is from after the segment was moved, the tombstone segment should have length 0.
|
|
141
|
-
However, these fields need to be independent from `removedSeq` due to the possibility of a removal and a move overlapping, as well as the differences
|
|
142
|
-
in how concurrent inserts are handled into a removed or a moved range.
|
|
143
|
-
|
|
144
|
-
### Remote perspective
|
|
145
|
-
|
|
146
|
-
We now move to some lower-level implementation details on how to ensure eventual consistency operating in this model.
|
|
147
|
-
|
|
148
|
-
First, consider the behavior a client must have when processing an obliterate op it didn't submit.
|
|
149
|
-
For concreteness and ease of explanation, say this op is `{ seq: 50, refSeq: 40, clientId: 2, op: <move the range [10, 15) to null }`.
|
|
150
|
-
The processing client should first mark all segments between the segment `getContainingSegment({ pos: 10, refSeq: 40, clientId: 2 })` and
|
|
151
|
-
`getContainingSegment({ pos: 15, refSeq: 40, clientId: 2 })` that are alive (i.e. inserted, not removed) from the perspective
|
|
152
|
-
`{ seq: 50, clientId: localClientId }` obliterated.
|
|
153
|
-
Note this means that if a segment in the range was concurrently removed, it won't be marked as moved as well.
|
|
154
|
-
The marking process should be roughly equivalent to what happens in a "remove" operation, but instead of updating `removedSeq`/`localRemovedSeq`
|
|
155
|
-
it updates `movedSeq` and `localMovedSeq`.
|
|
156
|
-
|
|
157
|
-
The other interesting difference between this operation and a normal removal is its inclusion of segments inserted between seq 40 and seq 50.
|
|
158
|
-
The current API on merge tree used for `markRangeRemoved` (which is `mapRange`) doesn't support iterating in this fashion,
|
|
159
|
-
but could easily be extended to do so.
|
|
160
|
-
One way to do that would be to decouple the `refSeq` and length calculations used for locating the positions and the `refSeq` used for
|
|
161
|
-
deciding whether or not to descend and `map` children nodes.
|
|
162
|
-
|
|
163
|
-
This handles removal of any concurrently inserted segments sequenced before the obliterate op, as well as local ops sequenced after the
|
|
164
|
-
obliterate op (since we use `localClientId`).
|
|
165
|
-
However, the client still needs to ensure concurrently inserted segments sequenced after the obliterate op are immediately removed.
|
|
166
|
-
The insert codepath will therefore need to take into account if the destination is inside of an ongoing moved area.
|
|
167
|
-
Excursions are a good tool for this job, but checking is still easier said than done.
|
|
168
|
-
Concretely, and continuing with the example operations given above, suppose this insertion happens:
|
|
169
|
-
|
|
170
|
-
```
|
|
171
|
-
{ seq: 60, refSeq: 40, clientId: 3, op: <insert "hello" at index 10> }
|
|
172
|
-
```
|
|
173
|
-
|
|
174
|
-
After locating the insertion point and updating the merge tree, we need to decide if the resulting segment is inside of a moved region.
|
|
175
|
-
If we happened to know the `seq` of the move we were testing for, this would be easy: the first adjacent segment in each direction from
|
|
176
|
-
the perspective of `{ seq: 50, clientId: localClientId }` can inform us if we're either inside or directly adjacent to that moved range.
|
|
177
|
-
Thus, a naive implementation could check all sequence numbers in the collab window.
|
|
178
|
-
The obvious optimization of only checking seq numbers of move ops would improve this slightly.
|
|
179
|
-
But we can do asymptotically better by leveraging the tree structure.
|
|
180
|
-
It would be ideal if we only needed to perform one commonly short excursion in each direction.
|
|
181
|
-
The only candidate that makes much sense is from the perspective of `{ seq: 60, clientId: 3 }` (i.e. the client submitting the insert op at
|
|
182
|
-
the time the op is sequenced).
|
|
183
|
-
The problem with this perspective is that ops 51 through 59 may have inserted a segment between the inserted "hello"
|
|
184
|
-
and the obliterated range that was submitted by a client which has already acked the obliterate.
|
|
185
|
-
For example, `{ seq: 55, clientId: 5, refSeq: 50, op: <insert "i won't be obliterated" at index 10> }`.
|
|
186
|
-
|
|
187
|
-
The forward excursion would need to continue past this segment in order to conclude it isn't in an obliterated range.
|
|
188
|
-
If that was the only such concurrent insert, the next segment it would visit would be an obliterated one and we'd decide
|
|
189
|
-
whether or not to include the newly inserted segment as part of the obliterated region based on some endpoint merge strategy.
|
|
190
|
-
|
|
191
|
-
The key insight is that visiting the segment with seq 55 does provide the excursion with information: since the segment
|
|
192
|
-
was inserted at seq 55 and isn't moved or removed, any move operation must have occurred before seq 55.
|
|
193
|
-
If we keep track of the smallest sequence number of alive segments that we've visited, we therefore have an upper bound
|
|
194
|
-
for any possible adjacent move op.
|
|
195
|
-
Thus, we can halt the excursion as soon as this upper bound falls below the smallest obliterate operation within the collab window.
|
|
196
|
-
If we alternatively reach a segment that has been moved concurrently to the insert we're processing, we can also stop
|
|
197
|
-
and use some endpoint resolution strategy.
|
|
198
|
-
|
|
199
|
-
The guarantee we get for a removed segment isn't quite as good: we only know that the move must have come either before
|
|
200
|
-
the segment was inserted or after it was removed (since move doesn't impact segments that are removed before its application).
|
|
201
|
-
We _could_ track this as part of our excursion by maintaining a range of disjoint intervals at which an obliterate "might have happened"
|
|
202
|
-
and exiting as soon as we know no obliterate is possible, but this is probably more effort than required: only decreasing our upper bound
|
|
203
|
-
for removed segments if our existing upper bound is below when the segment was removed is a reasonable intermediate approach that uses
|
|
204
|
-
less bookkeeping overhead.
|
|
205
|
-
|
|
206
|
-
All-in-all, the insert logic modification might look something like this:
|
|
207
|
-
|
|
208
|
-
```typescript
|
|
209
|
-
function wasRemovedAfter(seg: ISegment, seq: number): boolean {
|
|
210
|
-
return seg.removedSeq !== UnassignedSequenceNumber && seg.removedSeq > seq;
|
|
211
|
-
}
|
|
212
|
-
|
|
213
|
-
function insertingWalk(args /* mostly omitted */, op) {
|
|
214
|
-
/* regular insert logic goes here */
|
|
215
|
-
|
|
216
|
-
let moveUpperBound = Number.POSITIVE_INFINITY;
|
|
217
|
-
let movedSegment: ISegment | undefined = undefined;
|
|
218
|
-
const smallestSeqMoveOp = this.getSmallestSeqMoveOp();
|
|
219
|
-
const findAdjacedMovedSegment = (seg) => {
|
|
220
|
-
if (seg.movedSeq && seg.movedSeq > op.referenceSequenceNumber) {
|
|
221
|
-
movedSegment = seg;
|
|
222
|
-
return false;
|
|
223
|
-
}
|
|
224
|
-
|
|
225
|
-
if (!isRemovedAndAcked(seg) || wasRemovedAfter(seg, moveUpperBound)) {
|
|
226
|
-
moveUpperBound = Math.min(moveUpperBound, seg.seq);
|
|
227
|
-
}
|
|
228
|
-
// If we've reached a segment that existed before any of our in-collab-window move ops
|
|
229
|
-
// happened, no need to continue.
|
|
230
|
-
return moveUpperBound > smallestSeqMoveOp;
|
|
231
|
-
};
|
|
232
|
-
forwardExcursion(insertSegment, findAdjacedMovedSegment);
|
|
233
|
-
const furtherMovedSegment = movedSegment;
|
|
234
|
-
currentMin = Number.POSITIVE_INFINITY;
|
|
235
|
-
movedSeg = undefined;
|
|
236
|
-
backwardExcursion(insertSegment, findAdjacedMovedSegment);
|
|
237
|
-
const nearerMovedSegment = movedSegment;
|
|
238
|
-
if (
|
|
239
|
-
(nearerMovedSegment && breakEndpointTie(nearerMovedSegment, insertSegment, op)) ||
|
|
240
|
-
(furtherMovedSegment && breakEndpointTie(insertSegment, furtherMovedSegment, op))
|
|
241
|
-
) {
|
|
242
|
-
// These objects will be analogous to return from `toRemovalInfo`.
|
|
243
|
-
const nearMoveInfo = toMoveInfo(nearerMovedSegment);
|
|
244
|
-
const farMoveInfo = toMoveInfo(furtherMovedSegment);
|
|
245
|
-
// The inserted segment could potentially be adjacent to two different moved regions.
|
|
246
|
-
// We mark it as moved using the info from the earlier such operation.
|
|
247
|
-
const moveInfo = min(nearMoveInfo, farMoveInfo);
|
|
248
|
-
markSegmentMoved(insertSegment, moveInfo, op);
|
|
249
|
-
}
|
|
250
|
-
}
|
|
251
|
-
```
|
|
252
|
-
|
|
253
|
-
In reality it will be a bit more complicated: this does not properly handle inserting walks performed for local edits (which should never be immediately obliterated),
|
|
254
|
-
nor does it handle local, un-acked obliterates (which will be covered in the next section).
|
|
255
|
-
It's worth noting that removals between the obliterated seq and the inserting op's seq don't complicate things much because excursions visit all segments, regardless of visibility.
|
|
256
|
-
|
|
257
|
-
This limits the segment excursions to not be longer than the number of consecutive segments adjacent to the insertion
|
|
258
|
-
point that are all within the collaboration window.
|
|
259
|
-
That's probably performant enough, but if we want to optimize further at some memory cost it is probably possible to use the
|
|
260
|
-
partialLengths information to skip over blocks in some cases if the sequence numbers of obliterate ops are stored on
|
|
261
|
-
each merge block.
|
|
262
|
-
|
|
263
|
-
### Local perspective
|
|
264
|
-
|
|
265
|
-
Next, we move to the local handling of a move op while it's in flight.
|
|
266
|
-
For consistency with the rest of merge tree's segment state machine, the state transitions of `{ localMovedSeq, movedSeq }` and `{ localRemovedSeq, removedSeq }` should align (`movedSeq` is set to `UnassignedSeqNumber` while the op is in flight with `localMovedSeq` recording the local seq at which the move happened, then on ack of the op `localMovedSeq` is cleared out and `movedSeq` is replaced with the op's seq).
|
|
267
|
-
|
|
268
|
-
While a move op is in flight, any non-local insertions into a locally moved range need to be immediately moved to the range's current location
|
|
269
|
-
(or removed, if it was obliterated).
|
|
270
|
-
This can be accomplished by tweaking the `findAdjacentMovedSegment` function above to account for `localMovedSeq`:
|
|
271
|
-
|
|
272
|
-
```typescript
|
|
273
|
-
const findAdjacentMovedSegment = (seg) => {
|
|
274
|
-
if (
|
|
275
|
-
(seg.movedSeq && seg.movedSeq > op.referenceSequenceNumber) ||
|
|
276
|
-
seg.localMovedSeq !== undefined
|
|
277
|
-
) {
|
|
278
|
-
movedSegment = seg;
|
|
279
|
-
return false;
|
|
280
|
-
}
|
|
281
|
-
|
|
282
|
-
if (!isRemovedAndAcked(seg) || wasRemovedAfter(seg, moveUpperBound)) {
|
|
283
|
-
moveUpperBound = Math.min(moveUpperBound, seg.seq);
|
|
284
|
-
}
|
|
285
|
-
// If we've reached a segment that existed before any of our in-collab-window move ops
|
|
286
|
-
// happened, no need to continue.
|
|
287
|
-
return moveUpperBound > smallestSeqMoveOp;
|
|
288
|
-
};
|
|
289
|
-
```
|
|
290
|
-
|
|
291
|
-
We don't need to worry about the analogous problem of extending the excursion as a result of segments between the insert location and a local move
|
|
292
|
-
because any such segments would have also been marked as locally moved when they were inserted into the merge tree.
|
|
293
|
-
In the sample code written for the remote segment, this will also necessitate `markSegmentMoved` to tolerate marking segments with local obliteration info.
|
|
294
|
-
|
|
295
|
-
Much of the same logic that goes into conflicting local + remote removal will need to be applied for move.
|
|
296
|
-
Nothing stands out as a conceptual issue or hurdle in this realm, though. Just tricky conditionals.
|
|
297
|
-
|
|
298
|
-
Once the op is acked, the behavior in the [Remote perspective](#remote-perspective) section suffices for any further concurrent segments.
|
|
299
|
-
|
|
300
|
-
### Other aspects
|
|
301
|
-
|
|
302
|
-
#### Zamboni
|
|
303
|
-
|
|
304
|
-
Zamboni will need updating to account for the new bookkeeping fields, but there aren't any conceptual issues in this realm since zamboni cleans up unnecessary data for segments outside of the collaboration window and the only difference between remove and obliterate happens within the collab window.
|
|
305
|
-
|
|
306
|
-
#### Snapshot
|
|
307
|
-
|
|
308
|
-
Segments in the snapshot will need to serialize and rehydrate the newly added properties.
|
|
309
|
-
Most of the types are plain-old data and JSON.serialize with no issue.
|
|
310
|
-
When move is implemented (and so `moveDst` can actually be a local reference rather than undefined), that field will need some special handling.
|
|
311
|
-
Several schemes are possible, but in the end it should convert to either a `pos` within some view of the merge-tree or an index+offset into the array
|
|
312
|
-
of serialized segments.
|
|
313
|
-
|
|
314
|
-
#### Reconnection
|
|
315
|
-
|
|
316
|
-
When a move op is rebased, there will need to be local fixup of the range marked moved locally, since the resulting range may expand with different semantics (different ops
|
|
317
|
-
will be concurrent to the rebased version). Since locally applying a move doesn't impact any sequenced segment state (and merge policy is to override pending local moves
|
|
318
|
-
with any remote ones just like the remove merge policy), at worst this can be done unperformantly by walking the range, resetting state, and re-applying.
|
|
319
|
-
|
|
320
|
-
The methods necessary for interpreting where the new range should be in the rebased view of the local merge-tree already exist and are used for regular reconnect (e.g.
|
|
321
|
-
to remove a range of content), so should not present additional trouble.
|
|
322
|
-
|
|
323
|
-
#### Partial Lengths
|
|
324
|
-
|
|
325
|
-
One key capability of merge-tree is its ability to resolve the information `{ pos, clientId, refSeq }` (and potentially `localSeq` if the local client) into a particular
|
|
326
|
-
segment + offset in the merge-tree's leaves.
|
|
327
|
-
It does this efficiently by storing indexing structures at each internal node that allow querying for that node's length at any such perspective within the collab window,
|
|
328
|
-
then leveraging those structures in an efficient tree walk.
|
|
329
|
-
|
|
330
|
-
Adding additional tree operations that any client can undertake means that all other clients must be able to reason about their peers' current states.
|
|
331
|
-
For example, `movedSeq` and `localMovedSeq` will need to be considered when calculating the length of a node/range from a given perspective.
|
|
332
|
-
If the duplicated segment that's inserted as the result of a move is given the `clientId` of the moving client (as opposed to the originating client)
|
|
333
|
-
and `seq` of the move operation, generally existing partial lengths logic will work correctly for non-concurrently inserted segments if
|
|
334
|
-
`movedSeq` and `localMovedSeq` on the tombstoned segment are interpreted analogously to `removedSeq` and `localRemovedSeq`.
|
|
335
|
-
Note that this would require updating the description of the `clientId` field, and for attribution purposes we may want to track the clientId that originally
|
|
336
|
-
created the segment separately from the clientId that most recently caused the segment to be where it is (via move).
|
|
337
|
-
|
|
338
|
-
Things get more complicated when considering resolution of node lengths for concurrently inserted segments.
|
|
339
|
-
The remainder of this section assumes the content is obliterated rather than moved; there are additional difficulties for partial lengths when dealing with
|
|
340
|
-
overlapping moves not covered in this document (they are probably solvable, but may require changes to the representation of the partial lengths indexing
|
|
341
|
-
structure rather than just its data).
|
|
342
|
-
|
|
343
|
-
Concretely, let's consider how partial lengths might look for a segment concurrently inserted into a moved region.
|
|
344
|
-
|
|
345
|
-
Suppose:
|
|
346
|
-
|
|
347
|
-
```
|
|
348
|
-
// Initial state at seq 0: "0123456789"
|
|
349
|
-
{ seq: 1, refSeq: 0, clientId: 1, op: <move [0, 5) out of existence> }
|
|
350
|
-
{ seq: 2, refSeq: 0, clientId: 2, op: <insert "hi" at 2> }
|
|
351
|
-
{ seq: 3, refSeq: 0, clientId: 2, op: <insert "hello" at 7> }
|
|
352
|
-
```
|
|
353
|
-
|
|
354
|
-
The desired final state in this case would be "56hello789". After seq 2, client 0 (an observer) has segments that look like so (clientIds that aren't relevant are omitted):
|
|
355
|
-
|
|
356
|
-
```
|
|
357
|
-
[
|
|
358
|
-
{ seq: 0, movedSeq: 1, text: "01", movedClientIds: [1] },
|
|
359
|
-
{ seq: 2, movedSeq: 1, clientId: 2, text: "hi", movedClientIds: [1] },
|
|
360
|
-
{ seq: 0, movedSeq: 1, text: "234", movedClientIds: [1] },
|
|
361
|
-
{ seq: 0, text: "56789" }
|
|
362
|
-
]
|
|
363
|
-
```
|
|
364
|
-
|
|
365
|
-
If these segments are all in a single block and the minimum sequence number is 0, their parent's partial lengths resembles the following:
|
|
366
|
-
|
|
367
|
-
```
|
|
368
|
-
{
|
|
369
|
-
minLength: 10 // length of "0123456789"
|
|
370
|
-
partialLengths: [{ seq: 1, seglen: -5 }, { seq: 2, seglen: 0 }],
|
|
371
|
-
clientSeqNumbers: [[], [{ seq: 1, seglen: -5 }], /* client 2 */[ ?? ]]
|
|
372
|
-
}
|
|
373
|
-
```
|
|
374
|
-
|
|
375
|
-
This data reflects the fact that the subsequence starts at length 10 at seq 0, an observer client sees the length of the subsequence shrink by 5 at seq 1,
|
|
376
|
-
and doesn't see the length change afterward (note such a client hasn't yet received seq 3). It also looks correct for resolving client 1's perspective: even if
|
|
377
|
-
the refSeq isn't at least 1, `clientSeqNumbers[1]` will still cause the current client's interpretation of client 1's view to include the removal of the
|
|
378
|
-
range `[0, 5)`. Client 2 is the tricky one: the length of the block from client 2's perspective should be 12 at refSeq 0, but 5 at either refSeq 1 or 2.
|
|
379
|
-
It looks odd, but this can be accomplished by adding a `{ seq: 1 /* comes from movedSeq */, seglen: 2 }` entry to `clientSeqNumbers[2]`.
|
|
380
|
-
The intuition is that client 2 counts the length of the segment unless `seq >= movedSeq`, and the method used in partial lengths computes the length of
|
|
381
|
-
a subsequence using
|
|
382
|
-
|
|
383
|
-
(length at min seq) + (any deltas between minSeq and refSeq) + (any deltas for ops submitted by remote client between refSeq and now),
|
|
384
|
-
|
|
385
|
-
so the last term counts this entry precisely when it's desired.
|
|
386
|
-
|
|
387
|
-
What happens if the insert and the obliterate are concurrent but sequenced in the other order?
|
|
388
|
-
|
|
389
|
-
```
|
|
390
|
-
// Initial state at seq 0: "0123456789"
|
|
391
|
-
{ seq: 1, refSeq: 0, clientId: 2, op: <insert "hi" at 2> }
|
|
392
|
-
{ seq: 2, refSeq: 0, clientId: 1, op: <move [0, 5) out of existence> }
|
|
393
|
-
{ seq: 3, refSeq: 0, clientId: 2, op: <insert "hello" at 7> }
|
|
394
|
-
```
|
|
395
|
-
|
|
396
|
-
The segment state after seq 2 from client 0's perspective will look mostly the same:
|
|
397
|
-
|
|
398
|
-
```
|
|
399
|
-
[
|
|
400
|
-
{ seq: 0, movedSeq: 2, text: "01", movedClientIds: [1] },
|
|
401
|
-
{ seq: 1, movedSeq: 2, clientId: 2, text: "hi", movedClientIds: [1] },
|
|
402
|
-
{ seq: 0, movedSeq: 2, text: "234", movedClientIds: [1] },
|
|
403
|
-
{ seq: 0, text: "56789" }
|
|
404
|
-
]
|
|
405
|
-
```
|
|
406
|
-
|
|
407
|
-
And the partial lengths object might look like this:
|
|
408
|
-
|
|
409
|
-
```
|
|
410
|
-
{
|
|
411
|
-
minLength: 10 // length of "0123456789"
|
|
412
|
-
partialLengths: [{ seq: 1, seglen: 2 }, { seq: 2, seglen: -7 }],
|
|
413
|
-
clientSeqNumbers: [[], [{ seq: 1, seglen: 2 }, { seq: 2, seglen: -7 }], /* client 2 */[{ seq: 1, seglen: 2 }]]
|
|
414
|
-
}
|
|
415
|
-
```
|
|
416
|
-
|
|
417
|
-
Note that in this case, client 1's `clientSeqNumbers` needed to be fixed up to include an entry for the concurrently inserted segment.
|
|
418
|
-
Thus, when an obliterate/move affects a concurrently inserted segment, it's generally possible to modify the generated partial lengths'
|
|
419
|
-
`clientSeqNumbers` for the client that sequenced its concurrent op later using the information on the inserted segment to interpret
|
|
420
|
-
correct values.
|
|
421
|
-
|
|
422
|
-
This strategy is also consistent with the existing strategy for overlapping delete: see the following snippet from `addClientSeqNumberFromPartial`:
|
|
423
|
-
|
|
424
|
-
```typescript
|
|
425
|
-
if (partialLength.overlapRemoveClients) {
|
|
426
|
-
partialLength.overlapRemoveClients.map((oc: Property<number, IOverlapClient>) => {
|
|
427
|
-
// Original client entry was handled above
|
|
428
|
-
if (partialLength.clientId !== oc.data.clientId) {
|
|
429
|
-
this.addClientSeqNumber(oc.data.clientId, partialLength.seq, oc.data.seglen);
|
|
430
|
-
}
|
|
431
|
-
return true;
|
|
432
|
-
});
|
|
433
|
-
}
|
|
434
|
-
```
|
|
435
|
-
|
|
436
|
-
The other interesting case to go through is when an obliterate/move conflicts with another obliterate/move.
|
|
437
|
-
|
|
438
|
-
##### Review of overlapping removal
|
|
439
|
-
|
|
440
|
-
This section illustrates the basic existing handling for overlapping removal.
|
|
441
|
-
It can probably be skipped by readers familiar with the scheme, but is here to help the reader determine where assumptions may break down or go wrong
|
|
442
|
-
for overlapping obliterate/move.
|
|
443
|
-
|
|
444
|
-
Overlapping removal of a segment is tracked using the `removedClientIds` field, which is used in partial lengths to add adjustment entries to avoid double-counting
|
|
445
|
-
removal.
|
|
446
|
-
For example, suppose client 1 and client 2 concurrently remove the range `[0, 5)` and each performs some more ops before acking the others' remove.
|
|
447
|
-
That might look something like this:
|
|
448
|
-
|
|
449
|
-
```
|
|
450
|
-
// Initial state at seq 0: "0123456789"
|
|
451
|
-
{ seq: 1, refSeq: 0, clientId: 1, op: <remove [0, 5)> }
|
|
452
|
-
{ seq: 2, refSeq: 0, clientId: 2, op: <remove [0, 5)> }
|
|
453
|
-
{ seq: 3, refSeq: 0, clientId: 2, op: <insert "hi" at 2> }
|
|
454
|
-
```
|
|
455
|
-
|
|
456
|
-
The correct final state is "56hi789". Consider what happens when a listener client (say, client 0) attempts to interpret the insertion of "hi" by client 2.
|
|
457
|
-
Before processing, its merge tree segment state would look like so:
|
|
458
|
-
|
|
459
|
-
```
|
|
460
|
-
[
|
|
461
|
-
{ seq: 0, removedSeq: 1, removedClientIds: [1, 2], text: "01234" },
|
|
462
|
-
{ seq: 0, text: "56789" }
|
|
463
|
-
]
|
|
464
|
-
```
|
|
465
|
-
|
|
466
|
-
The constructed partial lengths object for the root of the merge tree would then be:
|
|
467
|
-
|
|
468
|
-
```
|
|
469
|
-
{
|
|
470
|
-
minLength: 10,
|
|
471
|
-
partialLengths: [{ seq: 1, seglen: -5 }],
|
|
472
|
-
clientSeqNumbers: [[], /* client 1 */[{ seq: 1, seglen: -5 }], /* client 2 */[{ seq: 1, seglen: -5 }]]
|
|
473
|
-
}
|
|
474
|
-
```
|
|
475
|
-
|
|
476
|
-
Note that client 2's delta applies from seq 1 onward rather than seq 2, since it's constructed using the `seq` and `removedClientIds` on the removed segment.
|
|
477
|
-
|
|
478
|
-
Client 0 would determine where to insert the op with seq 3 by:
|
|
479
|
-
|
|
480
|
-
1. Asking the root for its length at `{ clientId: 2, refSeq: 0 }`
|
|
481
|
-
|
|
482
|
-
- This calculation is based on (length at min seq) + (any deltas between minSeq and refSeq) + (any deltas for ops submitted by client 2) - (deltas submitted by client 2 before refSeq)
|
|
483
|
-
- From the above bookkeeping, it would compute 10 + 0 + (-5) - 0 = 5
|
|
484
|
-
|
|
485
|
-
2. Asking for the length of the first child at `{ clientId: 2, refSeq: 0 }`
|
|
486
|
-
|
|
487
|
-
- Conditionals here are a bit tedious, but we'd see that clientId 2 is in the segment's removedClientId list, so it has length 0
|
|
488
|
-
|
|
489
|
-
3. Asking for the length of the second child `{ clientId: 2, refSeq: 0 }`
|
|
490
|
-
|
|
491
|
-
- The segment is inserted and not removed, so it has length 5.
|
|
492
|
-
Since the search is looking for an accumulated position of 2, it determines that the correct insertion point is amidst this segment.
|
|
493
|
-
|
|
494
|
-
##### Overlapping obliterate
|
|
495
|
-
|
|
496
|
-
The same general strategy used for overlapping removes should be sufficient for tracking overlapping obliteration of segments.
|
|
497
|
-
It relies only on information about when and by who a segment was removed, and the main difference between remove and obliterate comes
|
|
498
|
-
from which segments they affect rather than how the segments are affected.
|
|
499
|
-
|
|
500
|
-
Note also that because `movedSeq` is distinct from `removedSeq`, the corresponding partial lengths entry for `movedClientIds[0]` obliterating the segment can be entered
|
|
501
|
-
distinctly from the partial lengths entry for `removedClientIds[0]` removing the segment.
|
|
502
|
-
|
|
503
|
-
Again, the interesting case to check is if two separate clients issue obliterate ops amidst a concurrent insert (otherwise it is functionally identical to the remove case).
|
|
504
|
-
|
|
505
|
-
```
|
|
506
|
-
// Initial state at seq 0: "0123456789"
|
|
507
|
-
{ seq: 1, refSeq: 0, clientId: 1, op: <obliterate [0, 5)> }
|
|
508
|
-
{ seq: 2, refSeq: 0, clientId: 2, op: <obliterate [0, 5)> }
|
|
509
|
-
{ seq: 3, refSeq: 0, clientId: 3, op: <insert "hi" at 2> }
|
|
510
|
-
```
|
|
511
|
-
|
|
512
|
-
The segment state of some observing client after seq 3 is essentially the same as in the non-overlapping example:
|
|
513
|
-
|
|
514
|
-
```
|
|
515
|
-
[
|
|
516
|
-
{ seq: 0, movedSeq: 1, text: "01", movedClientIds: [1, 2] },
|
|
517
|
-
{ seq: 3, movedSeq: 1, clientId: 3, text: "hi", movedClientIds: [1, 2] },
|
|
518
|
-
{ seq: 0, movedSeq: 1, text: "234", movedClientIds: [1, 2] },
|
|
519
|
-
{ seq: 0, text: "56789" }
|
|
520
|
-
]
|
|
521
|
-
```
|
|
522
|
-
|
|
523
|
-
From the observing client perspective, the interpretation of each client's text if they were to submit an op with refSeq 0 through 3 is as follows:
|
|
524
|
-
|
|
525
|
-
| refSeq | client 1 | client 2 | client 3 |
|
|
526
|
-
| 0 | 56789 | 56789 | 01hi23456789 |
|
|
527
|
-
| 1 | 56789 | 56789 | 56789 |
|
|
528
|
-
| 2 | 56789 | 56789 | 56789 |
|
|
529
|
-
| 3 | 56789 | 56789 | 56789 |
|
|
530
|
-
|
|
531
|
-
The corresponding lengths table is exactly what's achieved by combining the overlapping remove strategy with the strategy for bookkeeping concurrently inserted segments:
|
|
532
|
-
|
|
533
|
-
```
|
|
534
|
-
{
|
|
535
|
-
minLength: 10,
|
|
536
|
-
partialLengths: [{ seq: 1, seglen: -5 }, { seq: 3, seglen: 0 }],
|
|
537
|
-
clientSeqNumbers: [
|
|
538
|
-
[],
|
|
539
|
-
[{ seq: 1, seglen: -5 }],
|
|
540
|
-
[{ seq: 1, seglen: -5 }], /* comes from adding clientSeqNumber to all entries in removedClientIds */
|
|
541
|
-
[{ seq: 1, seglen: 2 }] /* comes from the inserted "hi" segment which has movedSeq <= seq */
|
|
542
|
-
]
|
|
543
|
-
}
|
|
544
|
-
```
|
|
545
|
-
|
|
546
|
-
This approach works if the operations are sequenced in the other order or intermediately as well.
|
|
547
|
-
|
|
548
|
-
## Endpoint Behavior
|
|
549
|
-
|
|
550
|
-
One important consideration is what happens near the endpoints of the removed range.
|
|
551
|
-
There are two general possibilities: either the obliterate expands to include segments inserted
|
|
552
|
-
adjacent to the endpoint, or it doesn't.
|
|
553
|
-
|
|
554
|
-
In the initial implementation, we should pick some fixed endpoint behavior analogous to how insertion merge policy is fixed.
|
|
555
|
-
Each option is roughly equivalent in difficulty.
|
|
556
|
-
|
|
557
|
-
If applications request more degrees of freedom in this area, the framework for merge outcomes described in [Move](##Move) is a good starting point.
|
|
558
|
-
|
|
559
|
-
## Public API
|
|
560
|
-
|
|
561
|
-
The public API of sequence will need to be updated for users to leverage the obliterate operation. The most obvious way to extend it would be to align the API shape with
|
|
562
|
-
`removeRange`:
|
|
563
|
-
|
|
564
|
-
```typescript
|
|
565
|
-
class SharedSegmentSequence<TInterval extends IInterval> {
|
|
566
|
-
public obliterateRange(start: number, end: number);
|
|
567
|
-
}
|
|
568
|
-
```
|
|
569
|
-
|
|
570
|
-
One interesting alternative is to align the public API of sequence with the idea that there are two conceptual kinds of ranges: slice ranges and set ranges (see the next section
|
|
571
|
-
for details).
|
|
572
|
-
If we did this, we might instead unify `removeRange` and `obliterateRange` into a single method taking in such a range object.
|
|
573
|
-
This would have the nice property of naturally extending to annotate operations, if we anticipate wanting to be able to annotate slice ranges.
|
|
574
|
-
|
|
575
|
-
## Move
|
|
576
|
-
|
|
577
|
-
There are several different possible options for defining merge outcomes for the "move" operation.
|
|
578
|
-
The upcoming SharedTree DDS has done a lot of thinking in this area and landed on a relatively simple set of semantics that give reasonable
|
|
579
|
-
outcomes in most cases (see [issue 9658](https://github.com/microsoft/FluidFramework/issues/9658) for some very detailed reading).
|
|
580
|
-
|
|
581
|
-
These semantics are implementable in merge-tree, are compatible with feature requests for obliterate, and generally seem like a good direction to take
|
|
582
|
-
that we can later extend if applications request.
|
|
583
|
-
|
|
584
|
-
There are a few primitive concepts that all of the merge outcomes depend on.
|
|
585
|
-
|
|
586
|
-
First, a sequence of length `N` is conceptualized as an interleaving set of `N+1` _gaps_ and `N` nodes.
|
|
587
|
-
Nodes in the sequence may move, but the gaps between the nodes do not.
|
|
588
|
-
|
|
589
|
-
Insertion into the sequence is performed by specifying a gap to insert in as well as a direction that the inserted content prefers to tend toward
|
|
590
|
-
in case other content is inserted/moved concurrently into the same gap.
|
|
591
|
-
|
|
592
|
-
> Merge-tree already conceptualizes insert locations similarly: it names the gaps `0` through `N`. It does not permit app-level specification of concurrent merges,
|
|
593
|
-
> but that degree of freedom doesn't need to be exposed.
|
|
594
|
-
|
|
595
|
-
Next, there are two types of range specifications: _set ranges_ and _slice ranges_.
|
|
596
|
-
|
|
597
|
-
A _set range_ targets exactly the objects in a given range at the time it was specified. In merge-tree terms, the segments that the range affects are
|
|
598
|
-
resolved from the perspective of the submitting client at its refSeq, and only those segments undergo whatever operation applies (move, annotate, remove).
|
|
599
|
-
|
|
600
|
-
> Merge-tree's `remove` operation has set range semantics, since it doesn't cause removal of any concurrently inserted segments.
|
|
601
|
-
> It's worth noting that a move operation with set range semantics is conceivable inside this framework, and not something merge-tree currently implements.
|
|
602
|
-
> E.g., if the set range "CDE" inside a string "ABCDEF" was moved to the end of the string, and someone concurrently moved "B" and "C" to the start,
|
|
603
|
-
> the string may end up "BAFCDE" or "BCAFDE" depending on the sequencing order of the moves.
|
|
604
|
-
|
|
605
|
-
Finally, a _slice range_ specifies a start location and an end location, where a location has the same object shape as an insert destination: a gap plus a merge direction.
|
|
606
|
-
The range of nodes that the operation affects is interpreted at the time the operation applies, and any concurrent insertions/moves of content _into_ that range
|
|
607
|
-
are also affected. The merge direction should be interpreted as relative to a "phantom segment" in the gap specifying the slice endpoint.
|
|
608
|
-
For example, in the string "ABCDE", the slice range
|
|
609
|
-
`[{ pos: 0, merge: <concurrent segments merge nearer> }, { pos: 3, merge: <concurrent segments merge further> })` referring to "ABC" would
|
|
610
|
-
not expand at either endpoint whereas if the merge options were flipped, it would expand at both endpoints.
|
|
611
|
-
|
|
612
|
-
> These semantics align with the proposed merge-tree `move` operation. Like insert, we can fix the direction things should merge
|
|
613
|
-
> (in this case it instead affects which way the move "expands") if consumers don't need the extra degrees of freedom.
|
|
614
|
-
|
|
615
|
-
Notice that because gaps don't move, this set of outcomes doesn't suffer from problems like a range specification becoming invalid (which happens with
|
|
616
|
-
how the legacy shared-tree assigns semantics to its ops, where each is relative to an id).
|
|
617
|
-
It also gives reasonable merge outcomes which basically amount to "first move wins."
|
|
618
|
-
Consider the following two troublesome cases of overlapping move.
|
|
619
|
-
|
|
620
|
-
#### Move within a move
|
|
621
|
-
|
|
622
|
-
```
|
|
623
|
-
// Initial state: "12345 AB CD"
|
|
624
|
-
{ seq: 1, refSeq: 0, clientId: 1, op: <move 2 through 4 to after "A"> } // (all of the op specification would actually be in terms of indices)
|
|
625
|
-
{ seq: 2, refSeq: 0, clientId: 2, op: <move 3 to after paragraph "C"> }
|
|
626
|
-
```
|
|
627
|
-
|
|
628
|
-
One can see with this order of sequencing, we'd end up with "15 A234B CD".
|
|
629
|
-
With the other order, we'd get "15 A24B C3D".
|
|
630
|
-
|
|
631
|
-
Both outcomes are reasonable; clients 1 and 2 effectively expressed opposing desires on where the 3 should go.
|
|
632
|
-
|
|
633
|
-
#### Move of a single endpoint past the other
|
|
634
|
-
|
|
635
|
-
```
|
|
636
|
-
// Initial state: "Paragraph 1<br>Paragraph 2<br>Paragraph 3<br>Paragraph 4<br>Paragraph 5"
|
|
637
|
-
{ seq: 1, refSeq: 0, clientId: 1, op: <move paragraphs 2 through 3 to the gap after paragraph 5> } // (all of the op specification would actually be in terms of indices)
|
|
638
|
-
{ seq: 2, refSeq: 0, clientId: 2, op: <move paragraphs 3 through 4 to the gap after paragraph 5> }
|
|
639
|
-
```
|
|
640
|
-
|
|
641
|
-
Client 1's op succeeds without conflict, giving intermediate state order of the paragraphs "14523".
|
|
642
|
-
Then client 2's op has a start endpoint targeting a tombstoned segment for paragraph 3, so it only affects paragraph 4.
|
|
643
|
-
The final state is "15423" since merge-tree chose near-merge-later.
|
|
644
|
-
|
|
645
|
-
If the ops are sequenced in the other order, the final state would instead be "15234".
|
|
646
|
-
|
|
647
|
-
Both of these outcomes are again generally plausible.
|
|
@@ -1,199 +0,0 @@
|
|
|
1
|
-
# ReferencePosition Documentation
|
|
2
|
-
|
|
3
|
-
ReferencePositions are used to indicate a MergeTree position which is stable as operations are performed. There are two
|
|
4
|
-
types:
|
|
5
|
-
|
|
6
|
-
1. LocalReferences refer to a segment and offset within that segment
|
|
7
|
-
2. Markers are actual segments in the Merge Tree
|
|
8
|
-
|
|
9
|
-
The function `Client.localReferencePositionToPosition` returns the numerical position of a reference in the client's
|
|
10
|
-
current view.
|
|
11
|
-
|
|
12
|
-
## LocalReference behavior on Remove
|
|
13
|
-
|
|
14
|
-
By default, LocalReferences become detached when the segment they reference is removed.
|
|
15
|
-
The ReferenceTypes SlideOnRemove, StayOnRemove, and Transient change this behavior.
|
|
16
|
-
They are only valid for LocalReferences.
|
|
17
|
-
They are exclusive - a reference may be at most one of these types.
|
|
18
|
-
|
|
19
|
-
### SlideOnRemove
|
|
20
|
-
|
|
21
|
-
The reference will slide to the next farthest segment when the segment is removed and the remove has been acknowledged.
|
|
22
|
-
Sliding will look for the next valid segment.
|
|
23
|
-
A valid segment is one whose creation has been acknowledged and either hasn't been removed
|
|
24
|
-
or the remove is pending (not acknowledged).
|
|
25
|
-
If a farther segment is found, then the LocalReference will be changed to refer to that segment and have offset 0.
|
|
26
|
-
In the event that the slide is happening on the acknowledgement of a remove, the slide to a farther segment will not
|
|
27
|
-
change the numerical position of the reference.
|
|
28
|
-
If there is no valid segment farther in the tree, then the slide will place the reference on the last valid segment.
|
|
29
|
-
The offset will be set to the last position in that segment.
|
|
30
|
-
In the event that the slide is happening on the acknowledgement of a remove, the reference would have been on the removed
|
|
31
|
-
segment. This slide from the removed segment to a nearer segment does change the numerical position of the reference.
|
|
32
|
-
If there is no valid position (all segments removed and acknowledged) then the reference is detached.
|
|
33
|
-
|
|
34
|
-
### StayOnRemove
|
|
35
|
-
|
|
36
|
-
The reference will stay on removed segments.
|
|
37
|
-
This behavior is only defined until the removed segment is cleaned up by Zamboni.
|
|
38
|
-
This is intended to be used only while collaborating (see below) while waiting for an acknowledgement.
|
|
39
|
-
|
|
40
|
-
### Transient
|
|
41
|
-
|
|
42
|
-
The reference is not tracked by the MergeTree.
|
|
43
|
-
It will continue to reference removed segments.
|
|
44
|
-
This behavior is only defined until the removed segment is cleaned up by Zamboni.
|
|
45
|
-
This is intended to be used to create transient references which may be compared with other references.
|
|
46
|
-
|
|
47
|
-
### Detached LocalReferences
|
|
48
|
-
|
|
49
|
-
A detached LocalReference does not reference a segment in the MergeTree.
|
|
50
|
-
Its position is defined to be `LocalReference.DetachedPosition` (-1).
|
|
51
|
-
|
|
52
|
-
### LocalReferences on Removed Segments
|
|
53
|
-
|
|
54
|
-
LocalReferences may reference removed segments:
|
|
55
|
-
|
|
56
|
-
- SlideOnRemove references may reference a removed segment which is pending (not acknowledged)
|
|
57
|
-
- StayOnRemove references may reference removed segments
|
|
58
|
-
- Transient references may reference removed segments
|
|
59
|
-
|
|
60
|
-
The numerical position of a reference which is on a removed segment will be one more than the previous (nearer) segment.
|
|
61
|
-
If there is a farther segment that is not removed, this will be the same as the position of the start of that segment.
|
|
62
|
-
If there is no farther segment, then the reference position will be the length of the tree (one more than the last valid
|
|
63
|
-
position in the tree).
|
|
64
|
-
|
|
65
|
-
## Eventually Consistent References
|
|
66
|
-
|
|
67
|
-
Markers are segments in the MergeTree and are eventually consistent.
|
|
68
|
-
LocalReferences may be used as part of an eventually consistent feature.
|
|
69
|
-
For example, SharedIntervals are built using LocalReferences.
|
|
70
|
-
|
|
71
|
-
### Implementing Eventually Consistent LocalReferences
|
|
72
|
-
|
|
73
|
-
To implement an operation which creates LocalReferences which will have an eventually consistent position:
|
|
74
|
-
|
|
75
|
-
1. Locally create the reference as StayOnRemove
|
|
76
|
-
2. Send the reference numerical position in an op
|
|
77
|
-
3. On acknowledgement of the local create:
|
|
78
|
-
1. set the `refType` of the reference to include `SlideOnRemove`
|
|
79
|
-
2. call `Client.getSlideToSegment` with the references current segment and offset to get the proper new location
|
|
80
|
-
3. Delete the old reference and create a new one with the returned values
|
|
81
|
-
4. Remote clients, on receiving the op, call `Client.getContainingSegment` followed by `Client.getSlideToSegment`
|
|
82
|
-
on the result. Call `Client.createLocalReferencePosition` with the result to create a `SlideOnRemove` reference.
|
|
83
|
-
5. If there is a dependency on the comparison of reference positions (such as the index in IntervalCollections)
|
|
84
|
-
must listen to the `beforeSlide` and `afterSlide` events on `IReferencePositionEvents`. When slide occurs the
|
|
85
|
-
relative position of references may have changed.
|
|
86
|
-
|
|
87
|
-
### Implementation Notes
|
|
88
|
-
|
|
89
|
-
This is the state diagram for the implementation of Eventually Consistent References.
|
|
90
|
-
|
|
91
|
-
```mermaid
|
|
92
|
-
flowchart LR
|
|
93
|
-
subgraph StayOnRemove
|
|
94
|
-
localCreate[Local Create Ref]
|
|
95
|
-
pendingRef(("Ref:StayOnRemove\nSegment:Pending|Normal"))
|
|
96
|
-
pendingRefPendingRemove((Ref:StayOnRemove\nSegment:Pending Remove))
|
|
97
|
-
pendingRefRemoved((Ref:StayOnRemove\nSegment:Removed))
|
|
98
|
-
localCreate-->pendingRef
|
|
99
|
-
pendingRef--local remove-->pendingRefPendingRemove
|
|
100
|
-
pendingRefPendingRemove--remote remove-->pendingRefRemoved
|
|
101
|
-
pendingRef--remote remove-->pendingRefRemoved
|
|
102
|
-
end
|
|
103
|
-
|
|
104
|
-
subgraph SlideOnRemove
|
|
105
|
-
remoteCreate[Remote Create Ref]
|
|
106
|
-
remoteChoice{Segment Removed?}
|
|
107
|
-
ref((Ref:SlideOnRemove\nSegment:Normal))
|
|
108
|
-
refPendingRemove((Ref:SlideOnRemove\nSegment:Pending Remove))
|
|
109
|
-
|
|
110
|
-
remoteCreate-->remoteChoice
|
|
111
|
-
remoteChoice--no-->ref
|
|
112
|
-
remoteChoice--locally-->refPendingRemove
|
|
113
|
-
ref--local remove-->refPendingRemove
|
|
114
|
-
end
|
|
115
|
-
|
|
116
|
-
slide{Slide To}
|
|
117
|
-
detached((Ref:Detached))
|
|
118
|
-
|
|
119
|
-
pendingRef--create ack-->ref
|
|
120
|
-
pendingRefPendingRemove--create ack-->refPendingRemove
|
|
121
|
-
pendingRefRemoved--create ack-->slide
|
|
122
|
-
remoteChoice--yes-->slide
|
|
123
|
-
ref--remote remove-->slide
|
|
124
|
-
refPendingRemove--remove ack-->slide
|
|
125
|
-
refPendingRemove--remote remove-->slide
|
|
126
|
-
slide--segment-->ref
|
|
127
|
-
slide--locally removed segment-->refPendingRemove
|
|
128
|
-
slide--no segment-->detached
|
|
129
|
-
```
|
|
130
|
-
|
|
131
|
-
This algorithm works because it ensures that slid references slide to the same segment.
|
|
132
|
-
The slide only happens when both the creation of the reference and removal of the segment have been acknowledged.
|
|
133
|
-
When sliding we do not consider any local (unacknowledged) ops.
|
|
134
|
-
|
|
135
|
-
Keeping references on removed segments until they can be slid works well in most cases because of these properties:
|
|
136
|
-
|
|
137
|
-
1. Interval positions on removed segments appear as if they were on the following position in the string.
|
|
138
|
-
If the removed segment is between positions 5 and 6, the interval positions on the removed segment appear to be at
|
|
139
|
-
position 6. This matches where they will eventually slide, so slide will not cause a change in position as long as
|
|
140
|
-
segments are not slid over and it is not necessary to slide to the near end of the string.
|
|
141
|
-
2. Text inserted at the same location as the removed segment is inserted before the removed segment.
|
|
142
|
-
So if the removed segment is between 0 and 1 (“A[removed]B”), insertText(1, “X”) inserts before the removed segment
|
|
143
|
-
(“AX[removed]B”). This makes it hard to end up with local only segments to be slid over, which will mean it is rare
|
|
144
|
-
that slide visibly changes the interval position. It can still happen if there is a conflicting remove, but that is
|
|
145
|
-
much less likely.
|
|
146
|
-
|
|
147
|
-
#### Conflict Scenarios
|
|
148
|
-
|
|
149
|
-
Considering Create Interval / Remove Range conflicts, here are the scenarios
|
|
150
|
-
(before indicates the relative sequence order):
|
|
151
|
-
|
|
152
|
-
1. Local create before local remove. Interval position needs to slide on ack of the local remove.
|
|
153
|
-
2. Remote create before remote remove. Slide on receiving the remove.
|
|
154
|
-
3. Local remove before local create. This is impossible – once the segment is removed locally an
|
|
155
|
-
interval position can’t be created on it.
|
|
156
|
-
4. Remote remove before remote create. (Possible if ops are from different remote clients).
|
|
157
|
-
Slide on receiving the remote create.
|
|
158
|
-
5. Remote create before local remove. Slide on the ack of the local remove.
|
|
159
|
-
6. Local create before remote remove. Slide on receiving the remove.
|
|
160
|
-
7. Local remove before remote create. Slide on receiving the create.
|
|
161
|
-
8. Remote remove before local create. Slide on receiving the ack of the create.
|
|
162
|
-
|
|
163
|
-
### Why Eventually Consistent References Can Not Have Stable Order
|
|
164
|
-
|
|
165
|
-
In an ideal system reference positions would have stable order. Specifically:
|
|
166
|
-
|
|
167
|
-
1. If in any client state the position of a reference is less than the position of a specific item in the sequence,
|
|
168
|
-
then the position of that reference would always be less than or equal to the position of that item.
|
|
169
|
-
2. If in any client state the position of reference A is less than the position of reference B,
|
|
170
|
-
then the position of reference A would always be less than or equal to the position of reference B.
|
|
171
|
-
|
|
172
|
-
Neither of these properties is true for SlideOnRemove references. This is a result of them sliding over local
|
|
173
|
-
only segments. This could change the relative positions of the sliding reference at items that are slid over,
|
|
174
|
-
as well as any references on those items. Note that these properties do hold for items and references
|
|
175
|
-
once the creation has been acknowledged (sequenced by the server).
|
|
176
|
-
|
|
177
|
-
Supporting stable order is not possible in the current system because:
|
|
178
|
-
|
|
179
|
-
1. Removing a range may cause multiple references that had been at different positions to all be at the same
|
|
180
|
-
position.
|
|
181
|
-
2. To preserve stable ordering, an insert that conflicts with that remove would need to be after some of those
|
|
182
|
-
references and before others.
|
|
183
|
-
3. Insertion position is specified as a numerical offset in the sequence, so can't specify where in the set
|
|
184
|
-
of references at a position to be inserted. (Technically there is enough information to do this within the
|
|
185
|
-
collab window. But that information is lost if reconnect/resubmit is required.)
|
|
186
|
-
|
|
187
|
-
Therefore implementing eventually consistent references with stable order would require adding additional
|
|
188
|
-
information to insert ops.
|
|
189
|
-
|
|
190
|
-
## Tests
|
|
191
|
-
|
|
192
|
-
- `packages\dds\merge-tree\src\test\client.localReference.spec.ts`
|
|
193
|
-
unit tests for LocalReferences
|
|
194
|
-
- `packages\dds\sequence\src\test\intervalCollection.spec.ts`
|
|
195
|
-
test LocalReferences as used in interval collections (including eventual consistency)
|
|
196
|
-
- `packages\test\test-end-to-end-tests\src\test\sharedInterval.spec.ts`
|
|
197
|
-
end-to-end tests using LocalReferences for interval collections.
|
|
198
|
-
These tests have only been minimally updated to reflect this implementation,
|
|
199
|
-
so they do not comprehensively test LocalReferences.
|