@knaw-huc/text-annotation-segmenter 0.5.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -19
- package/dist/GetOffsets.d.ts +2 -2
- package/dist/Model.d.ts +12 -6
- package/dist/findSegmentRange.d.ts +6 -0
- package/dist/index.d.ts +2 -4
- package/dist/index.js +19 -22
- package/dist/segment.d.ts +3 -13
- package/package.json +1 -1
- package/dist/GetMarkerPosition.d.ts +0 -2
- package/dist/findSegmentOffsets.d.ts +0 -13
package/README.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
[](https://www.npmjs.com/package/@knaw-huc/text-annotation-segmenter)
|
|
4
4
|
|
|
5
|
-
Utility functions to render annotations with character offsets in a text.
|
|
5
|
+
Utility functions to render annotations with character position offsets in a text.
|
|
6
6
|
|
|
7
7
|
Annotations on a text have a non-hierarchical nature, i.e., they can overlap:
|
|
8
8
|
```text
|
|
@@ -24,16 +24,16 @@ The `segment` function creates an array of segments: a flat, non-overlapping lis
|
|
|
24
24
|
|
|
25
25
|
### Functions
|
|
26
26
|
|
|
27
|
-
- [`segment<T>(text, annotations, getOffsets, getMarkerPosition?): TextSegment<T>[]`](./src/segment.ts) <br /> Split a text into TextSegments with character
|
|
27
|
+
- [`segment<T>(text, annotations, getOffsets, getMarkerPosition?): TextSegment<T>[]`](./src/segment.ts) <br /> Split a text into TextSegments with character positions and a list of applying annotations.
|
|
28
28
|
- [`groupSegments<T>(segments, isGroup, getId): SegmentGroup<T>[]`](src/groupSegments.ts) <br /> Recursively group segments into higher-level units (e.g., words, paragraphs, divs) by collecting all segments that share a matching annotation.
|
|
29
29
|
- [`collectGroupSegments<T>(group): TextSegment<T>[]`](src/groupSegments.ts) <br /> Recursively collect all TextSegments from a Group.
|
|
30
|
-
- [`
|
|
30
|
+
- [`findSegmentRange<T>(segments): Map<T, SegmentRange>`](src/findSegmentRange.ts) <br /> For each annotation, collect the first and last segment index it appears in, keyed by object reference.
|
|
31
31
|
|
|
32
32
|
### Types
|
|
33
33
|
|
|
34
34
|
- [`SegmentGroup<T>`](src/groupSegments.ts) <br /> A `Group<T>` (with annotation and recursive children) or `Ungrouped<T>` (with plain segments).
|
|
35
|
-
- [`
|
|
36
|
-
- [`TextSegment<T>`](./src/Model.ts) <br /> Start and end
|
|
35
|
+
- [`SegmentRange`](src/findSegmentRange.ts) <br /> Start and end segment indices of an annotation (the end index is exclusive).
|
|
36
|
+
- [`TextSegment<T>`](./src/Model.ts) <br /> Start and end character positions of a text segment (the end position is exclusive), plus the annotations that apply.
|
|
37
37
|
|
|
38
38
|
## Examples
|
|
39
39
|
|
|
@@ -51,17 +51,17 @@ annotation bc: __
|
|
|
51
51
|
import {segment} from "@knaw-huc/text-annotation-segmenter";
|
|
52
52
|
|
|
53
53
|
const text = 'abc';
|
|
54
|
-
const ab = {id: 'ab',
|
|
55
|
-
const bc = {id: 'bc',
|
|
54
|
+
const ab = {id: 'ab', start: 0, end: 2};
|
|
55
|
+
const bc = {id: 'bc', start: 1, end: 3};
|
|
56
56
|
|
|
57
57
|
const getOffsets = annotation => annotation;
|
|
58
58
|
|
|
59
59
|
const segments = segment(text, [ab, bc], getOffsets);
|
|
60
60
|
|
|
61
61
|
expect(segments).toEqual([
|
|
62
|
-
{index: 0,
|
|
63
|
-
{index: 1,
|
|
64
|
-
{index: 2,
|
|
62
|
+
{index: 0, start: 0, end: 1, value: 'a', annotations: [ab]},
|
|
63
|
+
{index: 1, start: 1, end: 2, value: 'b', annotations: [ab, bc]},
|
|
64
|
+
{index: 2, start: 2, end: 3, value: 'c', annotations: [bc]},
|
|
65
65
|
]);
|
|
66
66
|
```
|
|
67
67
|
- More examples: [segment.spec.ts](./src/segment.spec.ts).
|
|
@@ -83,8 +83,8 @@ paragraph: __
|
|
|
83
83
|
import {segment, groupSegments} from "@knaw-huc/text-annotation-segmenter";
|
|
84
84
|
|
|
85
85
|
const text = 'aabb';
|
|
86
|
-
const section = {id: 'section', type: 'section',
|
|
87
|
-
const paragraph = {id: 'paragraph', type: 'paragraph',
|
|
86
|
+
const section = {id: 'section', type: 'section', start: 0, end: 4};
|
|
87
|
+
const paragraph = {id: 'paragraph', type: 'paragraph', start: 0, end: 2};
|
|
88
88
|
|
|
89
89
|
const getOffsets = annotation => annotation;
|
|
90
90
|
const segments = segment(text, [section, paragraph], getOffsets);
|
|
@@ -125,19 +125,19 @@ annotation b: _
|
|
|
125
125
|
|
|
126
126
|
```ts
|
|
127
127
|
const text = 'ab';
|
|
128
|
-
const marker: Annotation = {
|
|
129
|
-
const a: Annotation = {
|
|
130
|
-
const b: Annotation = {
|
|
128
|
+
const marker: Annotation = {start: 1, end: 1, id: 'm'};
|
|
129
|
+
const a: Annotation = {start: 0, end: 1, id: 'a'};
|
|
130
|
+
const b: Annotation = {start: 1, end: 2, id: 'b'};
|
|
131
131
|
|
|
132
132
|
const segments = segment(text, [marker, a, b], getOffsets);
|
|
133
133
|
const markerSegment = segments[1];
|
|
134
134
|
expect(markerSegment).toEqual(
|
|
135
|
-
{index: 1,
|
|
135
|
+
{index: 1, start: 1, end: 1, value: '', annotations: [marker, a]},
|
|
136
136
|
);
|
|
137
137
|
```
|
|
138
|
-
Notice how the marker is 'postfixed' to the previous annotation. When the marker shares its character index with the start and end
|
|
138
|
+
Notice how the marker is 'postfixed' to the previous annotation. When the marker shares its character index with the start and end positions of other annotations, the marker segment by default includes only the annotations that end at that position (e.g., a note at the end of a paragraph).
|
|
139
139
|
|
|
140
|
-
However, markers can also be prefixed to annotations that share their start
|
|
140
|
+
However, markers can also be prefixed to annotations that share their start position with the marker, using the `getMarkerPosition` parameter of the `segment` function:
|
|
141
141
|
|
|
142
142
|
```ts
|
|
143
143
|
const getMarkerPosition = a => a.id === 'm' ? 'prefix' : 'postfix'
|
|
@@ -145,7 +145,7 @@ const segments = segment(text, [marker, a, b], getOffsets, getMarkerPosition);
|
|
|
145
145
|
|
|
146
146
|
const markerSegment = segments[1];
|
|
147
147
|
expect(markerSegment).toEqual(
|
|
148
|
-
{index: 1,
|
|
148
|
+
{index: 1, start: 1, end: 1, value: '', annotations: [marker, b]},
|
|
149
149
|
);
|
|
150
150
|
```
|
|
151
151
|
More examples: [segment.spec.ts](./src/segment.spec.ts#L118).
|
package/dist/GetOffsets.d.ts
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import {
|
|
2
|
-
export type GetOffsets<T> = (annotation: T) =>
|
|
1
|
+
import { TextPosition } from './Model.ts';
|
|
2
|
+
export type GetOffsets<T> = (annotation: T) => TextPosition;
|
package/dist/Model.d.ts
CHANGED
|
@@ -1,17 +1,23 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Start and end character positions; the end position is exclusive
|
|
3
3
|
*/
|
|
4
|
-
export type
|
|
5
|
-
|
|
4
|
+
export type TextPosition = {
|
|
5
|
+
start: number;
|
|
6
6
|
end: number;
|
|
7
7
|
};
|
|
8
8
|
/**
|
|
9
|
-
* Output:
|
|
9
|
+
* Output: the character range and sliced text of a text segment, plus all annotations that apply.
|
|
10
10
|
*/
|
|
11
|
-
export type TextSegment<T> =
|
|
11
|
+
export type TextSegment<T> = TextPosition & {
|
|
12
12
|
index: SegmentIndex;
|
|
13
|
-
|
|
13
|
+
value: string;
|
|
14
14
|
annotations: T[];
|
|
15
15
|
};
|
|
16
16
|
export type SegmentIndex = number;
|
|
17
|
-
|
|
17
|
+
/**
|
|
18
|
+
* Start and end segment indices of an annotation; the end index is exclusive
|
|
19
|
+
*/
|
|
20
|
+
export type SegmentRange = {
|
|
21
|
+
startSegment: number;
|
|
22
|
+
endSegment: number;
|
|
23
|
+
};
|
|
package/dist/findSegmentRange.d.ts
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import { SegmentRange, TextSegment } from './Model';
|
|
2
|
+
/**
|
|
3
|
+
* For each annotation, collect the first and last segment index it appears in,
|
|
4
|
+
* keyed by object reference.
|
|
5
|
+
*/
|
|
6
|
+
export declare function findSegmentRange<T>(segments: TextSegment<T>[]): Map<T, SegmentRange>;
|
package/dist/index.d.ts
CHANGED
|
@@ -1,10 +1,8 @@
|
|
|
1
1
|
export { segment } from './segment.ts';
|
|
2
2
|
export type { AnnotationSegmentsByChar } from './segment.ts';
|
|
3
|
-
export {
|
|
4
|
-
export type { SegmentOffsets } from './findSegmentOffsets.ts';
|
|
3
|
+
export { findSegmentRange } from './findSegmentRange.ts';
|
|
5
4
|
export { groupSegments } from './groupSegments.ts';
|
|
6
5
|
export { collectGroupSegments } from './collectGroupSegments.ts';
|
|
7
6
|
export type { Group, SegmentGroup } from './groupSegments.ts';
|
|
8
|
-
export type {
|
|
7
|
+
export type { TextPosition, TextSegment, SegmentIndex, SegmentRange } from './Model.ts';
|
|
9
8
|
export type { GetOffsets } from './GetOffsets.ts';
|
|
10
|
-
export type { GetMarkerPosition } from './GetMarkerPosition.ts';
|
package/dist/index.js
CHANGED
|
@@ -1,22 +1,16 @@
|
|
|
1
|
-
function segment(text, annotations, getOffsets
|
|
1
|
+
function segment(text, annotations, getOffsets) {
|
|
2
2
|
const segments = [];
|
|
3
3
|
let segmentCounter = -1;
|
|
4
4
|
const offsetMap = /* @__PURE__ */ new Map(), getOrCreateOffset = (charIndex) => {
|
|
5
5
|
let offset = offsetMap.get(charIndex);
|
|
6
6
|
return offset || (offset = { charIndex, starting: [], ending: [] }, offsetMap.set(charIndex, offset)), offset;
|
|
7
|
-
}
|
|
8
|
-
index: ++segmentCounter,
|
|
9
|
-
begin: charIndex,
|
|
10
|
-
end: charIndex,
|
|
11
|
-
body: "",
|
|
12
|
-
annotations: [...markers, ...activeAnnotations]
|
|
13
|
-
});
|
|
7
|
+
};
|
|
14
8
|
getOrCreateOffset(0), getOrCreateOffset(text.length);
|
|
15
9
|
for (const annotation of annotations) {
|
|
16
|
-
const offsets = getOffsets(annotation), isMarker = offsets.
|
|
17
|
-
if (isMarker && (offsets.end < 0 || offsets.
|
|
10
|
+
const offsets = getOffsets(annotation), isMarker = offsets.start === offsets.end;
|
|
11
|
+
if (isMarker && (offsets.end < 0 || offsets.start > text.length) || !isMarker && (offsets.end <= 0 || offsets.start >= text.length))
|
|
18
12
|
continue;
|
|
19
|
-
const begin = Math.max(0, offsets.
|
|
13
|
+
const begin = Math.max(0, offsets.start), end = Math.min(text.length, offsets.end);
|
|
20
14
|
getOrCreateOffset(begin).starting.push(annotation), getOrCreateOffset(end).ending.push(annotation);
|
|
21
15
|
}
|
|
22
16
|
const sortedOffsets = [...offsetMap.values()].sort(
|
|
@@ -28,31 +22,34 @@ function segment(text, annotations, getOffsets, getMarkerPosition = () => "postf
|
|
|
28
22
|
const index = ++segmentCounter;
|
|
29
23
|
segments.push({
|
|
30
24
|
index,
|
|
31
|
-
|
|
25
|
+
start: lastOffset,
|
|
32
26
|
end: offset.charIndex,
|
|
33
|
-
|
|
27
|
+
value: text.slice(lastOffset, offset.charIndex),
|
|
34
28
|
annotations: [...activeAnnotations]
|
|
35
29
|
});
|
|
36
30
|
}
|
|
37
31
|
lastOffset = offset.charIndex;
|
|
38
|
-
const
|
|
39
|
-
for (const annotation of offset.starting)
|
|
40
|
-
offset.ending.includes(annotation) && (getMarkerPosition(annotation) === "prefix" ? prefixMarkers.push(annotation) : postfixMarkers.push(annotation));
|
|
41
|
-
postfixMarkers.length && segments.push(createMarkerSegment(postfixMarkers, offset.charIndex));
|
|
32
|
+
const markers = [];
|
|
42
33
|
for (const annotation of offset.starting)
|
|
43
|
-
offset.ending.includes(annotation)
|
|
34
|
+
offset.ending.includes(annotation) ? markers.push(annotation) : activeAnnotations.add(annotation);
|
|
35
|
+
markers.length && segments.push({
|
|
36
|
+
index: ++segmentCounter,
|
|
37
|
+
start: offset.charIndex,
|
|
38
|
+
end: offset.charIndex,
|
|
39
|
+
value: "",
|
|
40
|
+
annotations: [...markers, ...activeAnnotations]
|
|
41
|
+
});
|
|
44
42
|
for (const annotation of offset.ending)
|
|
45
43
|
activeAnnotations.delete(annotation);
|
|
46
|
-
prefixMarkers.length && segments.push(createMarkerSegment(prefixMarkers, offset.charIndex));
|
|
47
44
|
}
|
|
48
45
|
return segments;
|
|
49
46
|
}
|
|
50
|
-
function
|
|
47
|
+
function findSegmentRange(segments) {
|
|
51
48
|
const offsets = /* @__PURE__ */ new Map();
|
|
52
49
|
for (let i = 0; i < segments.length; i++)
|
|
53
50
|
for (const annotation of segments[i].annotations) {
|
|
54
51
|
const existing = offsets.get(annotation);
|
|
55
|
-
existing ? existing.endSegment = i + 1 : offsets.set(annotation, {
|
|
52
|
+
existing ? existing.endSegment = i + 1 : offsets.set(annotation, { startSegment: i, endSegment: i + 1 });
|
|
56
53
|
}
|
|
57
54
|
return offsets;
|
|
58
55
|
}
|
|
@@ -102,7 +99,7 @@ function collectGroupSegments(group2) {
|
|
|
102
99
|
}
|
|
103
100
|
export {
|
|
104
101
|
collectGroupSegments,
|
|
105
|
-
|
|
102
|
+
findSegmentRange,
|
|
106
103
|
groupSegments,
|
|
107
104
|
segment
|
|
108
105
|
};
|
package/dist/segment.d.ts
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import { TextSegment } from './Model';
|
|
2
2
|
import { GetOffsets } from './GetOffsets.ts';
|
|
3
|
-
import { GetMarkerPosition } from './GetMarkerPosition.ts';
|
|
4
3
|
/**
|
|
5
4
|
* Split a text into {@link TextSegment}s with character offsets and a list of applying annotations.
|
|
6
5
|
*
|
|
@@ -10,19 +9,10 @@ import { GetMarkerPosition } from './GetMarkerPosition.ts';
|
|
|
10
9
|
*
|
|
11
10
|
* @param getOffsets Find the character begin and end index of an annotation
|
|
12
11
|
*
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
* - 'postfix': include annotations that end at marker offset
|
|
16
|
-
*
|
|
17
|
-
* For example, consider two paragraphs: <p1>aa</p1><marker/><p2>bb</p2>
|
|
18
|
-
* The annotation <marker/> shares the end character index with annotation <p1>,
|
|
19
|
-
* and the begin character index with annotation <p2>:
|
|
20
|
-
* - with 'postfix': segment (2,2) will contain annotations [marker, p1]
|
|
21
|
-
* - with 'prefix': segment (2,2) will contain annotations [marker, p2]
|
|
22
|
-
*
|
|
23
|
-
* By default, markers are postfixed.
|
|
12
|
+
* Marker segments include all annotations present at that position:
|
|
13
|
+
* ending, spanning, and starting. Consumers can filter these as needed.
|
|
24
14
|
*/
|
|
25
|
-
export declare function segment<T>(text: string, annotations: T[], getOffsets: GetOffsets<T
|
|
15
|
+
export declare function segment<T>(text: string, annotations: T[], getOffsets: GetOffsets<T>): TextSegment<T>[];
|
|
26
16
|
export type AnnotationSegmentsByChar<T> = {
|
|
27
17
|
charIndex: number;
|
|
28
18
|
starting: T[];
|
package/package.json
CHANGED
|
package/dist/findSegmentOffsets.d.ts
DELETED
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
import { TextSegment } from './Model';
|
|
2
|
-
export type SegmentOffsets = {
|
|
3
|
-
beginSegment: number;
|
|
4
|
-
/**
|
|
5
|
-
* Excluding last segment
|
|
6
|
-
*/
|
|
7
|
-
endSegment: number;
|
|
8
|
-
};
|
|
9
|
-
/**
|
|
10
|
-
* For each annotation, collect the first and last segment index
|
|
11
|
-
* it appears in, keyed by object reference.
|
|
12
|
-
*/
|
|
13
|
-
export declare function findSegmentOffsets<T>(segments: TextSegment<T>[]): Map<T, SegmentOffsets>;
|