@knaw-huc/text-annotation-segmenter 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -18
- package/dist/GetId.d.ts +1 -0
- package/dist/GetOffsets.d.ts +2 -0
- package/dist/Model.d.ts +4 -1
- package/dist/findSegmentOffsets.d.ts +13 -0
- package/dist/groupSegments.d.ts +17 -4
- package/dist/index.d.ts +4 -3
- package/dist/index.js +45 -20
- package/dist/segment.d.ts +4 -3
- package/package.json +1 -1
- package/dist/mapAnnotationSegmentRanges.d.ts +0 -12
package/README.md
CHANGED
|
@@ -15,17 +15,18 @@ A special case is the marker: an annotation of zero width marking a position in
|
|
|
15
15
|
|
|
16
16
|
## API
|
|
17
17
|
|
|
18
|
-
- [`segment<T>(text, annotations): TextSegment<T>[]`](./src/segment.ts) <br /> Split a text into TextSegments with char offsets to the text and a list of applying annotations.
|
|
19
|
-
- [`
|
|
20
|
-
- [`
|
|
21
|
-
- [`
|
|
22
|
-
- [`
|
|
23
|
-
- [`
|
|
24
|
-
- [`SegmentRange`](./src/mapAnnotationSegmentRanges.ts) <br /> Start and end segment index of an annotation (end excluding).
|
|
18
|
+
- [`segment<T>(text, annotations, getOffsets): TextSegment<T>[]`](./src/segment.ts) <br /> Split a text into TextSegments with char offsets to the text and a list of applying annotations.
|
|
19
|
+
- [`TextSegment<T>`](./src/Model.ts) <br /> Start and end character offsets of a text segment (end offset exclusive), plus the annotations that apply.
|
|
20
|
+
- [`groupSegments<T>(segments, isGroup, getId): SegmentGroup<T>[]`](src/groupSegments.ts) <br /> Recursively group segments into higher-level units (e.g., words, paragraphs, divs) by collecting all segments that share a matching annotation.
|
|
21
|
+
- [`SegmentGroup<T>`](src/groupSegments.ts) <br /> A `Group<T>` (with annotation and recursive children) or `Ungrouped<T>` (with plain segments).
|
|
22
|
+
- [`findSegmentOffsets<T>(segments): Map<T, SegmentOffsets>`](src/findSegmentOffsets.ts) <br /> For each annotation, collect the first and last segment index it appears in, keyed by object reference.
|
|
23
|
+
- [`SegmentOffsets`](src/findSegmentOffsets.ts) <br /> Start and end segment index of an annotation (end index exclusive).
|
|
25
24
|
|
|
26
25
|
## Example
|
|
27
26
|
|
|
28
|
-
|
|
27
|
+
### Create text segments
|
|
28
|
+
|
|
29
|
+
A text 'abc' with two overlapping annotations at 'ab' and 'bc' will be split up into three segments:
|
|
29
30
|
|
|
30
31
|
```txt
|
|
31
32
|
text: abc
|
|
@@ -34,24 +35,67 @@ annotation bc: __
|
|
|
34
35
|
```
|
|
35
36
|
|
|
36
37
|
```ts
|
|
37
|
-
import { segment } from "text-annotation-segmenter";
|
|
38
|
+
import { segment } from "@knaw-huc/text-annotation-segmenter";
|
|
38
39
|
|
|
39
40
|
const text = 'abc';
|
|
40
|
-
const ab = {id: 'ab'};
|
|
41
|
-
const bc = {id: 'bc'};
|
|
41
|
+
const ab = {id: 'ab', begin: 0, end: 2};
|
|
42
|
+
const bc = {id: 'bc', begin: 1, end: 3};
|
|
42
43
|
|
|
43
|
-
const segments = segment(text, [
|
|
44
|
-
{begin: 0, end: 2, body: ab},
|
|
45
|
-
{begin: 1, end: 3, body: bc},
|
|
46
|
-
]);
|
|
44
|
+
const segments = segment(text, [ab, bc], (a) => a);
|
|
47
45
|
|
|
48
46
|
expect(segments).toEqual([
|
|
49
|
-
{
|
|
50
|
-
{
|
|
51
|
-
{
|
|
47
|
+
{index: 0, begin: 0, end: 1, body: 'a', annotations: [ab]},
|
|
48
|
+
{index: 1, begin: 1, end: 2, body: 'b', annotations: [ab, bc]},
|
|
49
|
+
{index: 2, begin: 2, end: 3, body: 'c', annotations: [bc]},
|
|
50
|
+
]);
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### Group segments
|
|
54
|
+
|
|
55
|
+
We can use segments to build up a hierarchy of elements.
|
|
56
|
+
|
|
57
|
+
Given the text 'aabb' with a section spanning the whole text and a paragraph spanning the first half:
|
|
58
|
+
|
|
59
|
+
```txt
|
|
60
|
+
text: aabb
|
|
61
|
+
section: ____
|
|
62
|
+
paragraph: __
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
```ts
|
|
66
|
+
import {segment, groupSegments} from "@knaw-huc/text-annotation-segmenter";
|
|
67
|
+
|
|
68
|
+
const text = 'aabb';
|
|
69
|
+
const section = {id: 'section', type: 'section', begin: 0, end: 4};
|
|
70
|
+
const paragraph = {id: 'paragraph', type: 'paragraph', begin: 0, end: 2};
|
|
71
|
+
|
|
72
|
+
const segments = segment(text, [section, paragraph], (a) => a);
|
|
73
|
+
|
|
74
|
+
const isGroup = a => a.type === 'section' || a.type === 'paragraph';
|
|
75
|
+
const getId = a => a.id;
|
|
76
|
+
|
|
77
|
+
const groups = groupSegments(
|
|
78
|
+
segments,
|
|
79
|
+
isGroup,
|
|
80
|
+
getId,
|
|
81
|
+
);
|
|
82
|
+
|
|
83
|
+
expect(groups).toEqual([
|
|
84
|
+
{
|
|
85
|
+
isGroup: true, annotation: section, children: [
|
|
86
|
+
{
|
|
87
|
+
isGroup: true, annotation: paragraph, children: [
|
|
88
|
+
{isGroup: false, segments: [segments[0]]}
|
|
89
|
+
]
|
|
90
|
+
},
|
|
91
|
+
{isGroup: false, segments: [segments[1]]},
|
|
92
|
+
]
|
|
93
|
+
}
|
|
52
94
|
]);
|
|
53
95
|
```
|
|
54
96
|
|
|
97
|
+
---
|
|
98
|
+
|
|
55
99
|
More examples:
|
|
56
100
|
|
|
57
101
|
- For edge cases, see: [segment.spec.ts](./src/segment.spec.ts).
|
package/dist/GetId.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export type GetId<T> = (annotation: T) => string;
|
package/dist/Model.d.ts
CHANGED
|
@@ -1,9 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Start and end character offsets; `end` is exclusive (one past the last character).
|
|
3
|
+
*/
|
|
1
4
|
export type Offsets = {
|
|
2
5
|
begin: number;
|
|
3
6
|
end: number;
|
|
4
7
|
};
|
|
5
8
|
/**
|
|
6
|
-
* Output:
|
|
9
|
+
* Output: start and end character offsets of a text segment (end offset exclusive), plus the annotations that apply.
|
|
7
10
|
*/
|
|
8
11
|
export type TextSegment<T> = Offsets & {
|
|
9
12
|
index: SegmentIndex;
|
|
package/dist/findSegmentOffsets.d.ts
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { TextSegment } from './Model';
|
|
2
|
+
export type SegmentOffsets = {
|
|
3
|
+
beginSegment: number;
|
|
4
|
+
/**
|
|
5
|
+
* Exclusive: one past the index of the last segment the annotation appears in
|
|
6
|
+
*/
|
|
7
|
+
endSegment: number;
|
|
8
|
+
};
|
|
9
|
+
/**
|
|
10
|
+
* For each annotation, collect the first and last segment index
|
|
11
|
+
* it appears in, keyed by object reference.
|
|
12
|
+
*/
|
|
13
|
+
export declare function findSegmentOffsets<T>(segments: TextSegment<T>[]): Map<T, SegmentOffsets>;
|
package/dist/groupSegments.d.ts
CHANGED
|
@@ -1,10 +1,23 @@
|
|
|
1
|
-
import { TextSegment } from './Model
|
|
1
|
+
import { TextSegment } from './Model';
|
|
2
|
+
import { GetId } from './GetId.ts';
|
|
3
|
+
export type SegmentGroup<T> = Group<T> | Ungrouped<T>;
|
|
2
4
|
export type Group<T> = {
|
|
5
|
+
isGroup: true;
|
|
3
6
|
annotation: T;
|
|
7
|
+
children: SegmentGroup<T>[];
|
|
8
|
+
};
|
|
9
|
+
export type Ungrouped<T> = {
|
|
10
|
+
isGroup: false;
|
|
4
11
|
segments: TextSegment<T>[];
|
|
5
12
|
};
|
|
6
13
|
/**
|
|
7
|
-
*
|
|
8
|
-
*
|
|
14
|
+
* Recursively group a list of segments into higher-level units (e.g. words,
|
|
15
|
+
* paragraphs or divs) by segments that share a matching annotation.
|
|
16
|
+
*
|
|
17
|
+
* Example: text "aabb" with section (0–4) and paragraph (0–2) produces:
|
|
18
|
+
* [group(section): [group(paragraph("aa")), ungrouped("bb")]]
|
|
19
|
+
*
|
|
20
|
+
* Note: annotations are assumed to be ordered by nesting depth
|
|
21
|
+
* in each segment's annotations array, outer-first.
|
|
9
22
|
*/
|
|
10
|
-
export declare function groupSegments<T>(segments: TextSegment<T>[],
|
|
23
|
+
export declare function groupSegments<T>(segments: TextSegment<T>[], isGroup: (annotation: T) => boolean, getId: GetId<T>): SegmentGroup<T>[];
|
package/dist/index.d.ts
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
export { segment } from './segment.ts';
|
|
2
2
|
export type { AnnotationSegmentsByChar } from './segment.ts';
|
|
3
|
-
export {
|
|
4
|
-
export type {
|
|
3
|
+
export { findSegmentOffsets } from './findSegmentOffsets.ts';
|
|
4
|
+
export type { SegmentOffsets } from './findSegmentOffsets.ts';
|
|
5
5
|
export { groupSegments } from './groupSegments.ts';
|
|
6
|
-
export type { Group } from './groupSegments.ts';
|
|
6
|
+
export type { Group, SegmentGroup } from './groupSegments.ts';
|
|
7
7
|
export type { Offsets, TextSegment, SegmentIndex, } from './Model.ts';
|
|
8
|
+
export type { GetOffsets } from './GetOffsets.ts';
|
package/dist/index.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
function segment(text, annotations) {
|
|
1
|
+
function segment(text, annotations, getOffsets) {
|
|
2
2
|
const segments = [];
|
|
3
3
|
let segmentCounter = -1;
|
|
4
4
|
const offsetMap = /* @__PURE__ */ new Map(), getOrCreateOffset = (charIndex) => {
|
|
@@ -7,10 +7,10 @@ function segment(text, annotations) {
|
|
|
7
7
|
};
|
|
8
8
|
getOrCreateOffset(0), getOrCreateOffset(text.length);
|
|
9
9
|
for (const annotation of annotations) {
|
|
10
|
-
const isMarker =
|
|
11
|
-
if (isMarker && (
|
|
10
|
+
const offsets = getOffsets(annotation), isMarker = offsets.begin === offsets.end;
|
|
11
|
+
if (isMarker && (offsets.end < 0 || offsets.begin > text.length) || !isMarker && (offsets.end <= 0 || offsets.begin >= text.length))
|
|
12
12
|
continue;
|
|
13
|
-
const begin = Math.max(0,
|
|
13
|
+
const begin = Math.max(0, offsets.begin), end = Math.min(text.length, offsets.end);
|
|
14
14
|
getOrCreateOffset(begin).starting.push(annotation), getOrCreateOffset(end).ending.push(annotation);
|
|
15
15
|
}
|
|
16
16
|
const sortedOffsets = [...offsetMap.values()].sort(
|
|
@@ -49,31 +49,56 @@ function segment(text, annotations) {
|
|
|
49
49
|
}
|
|
50
50
|
return segments;
|
|
51
51
|
}
|
|
52
|
-
function
|
|
53
|
-
const
|
|
52
|
+
function findSegmentOffsets(segments) {
|
|
53
|
+
const offsets = /* @__PURE__ */ new Map();
|
|
54
54
|
for (let i = 0; i < segments.length; i++)
|
|
55
55
|
for (const annotation of segments[i].annotations) {
|
|
56
|
-
const existing =
|
|
57
|
-
existing ? existing.endSegment = i + 1 :
|
|
56
|
+
const existing = offsets.get(annotation);
|
|
57
|
+
existing ? existing.endSegment = i + 1 : offsets.set(annotation, { beginSegment: i, endSegment: i + 1 });
|
|
58
58
|
}
|
|
59
|
-
return
|
|
59
|
+
return offsets;
|
|
60
60
|
}
|
|
61
|
-
function groupSegments(segments,
|
|
62
|
-
|
|
61
|
+
function groupSegments(segments, isGroup, getId) {
|
|
62
|
+
return group(segments, isGroup, getId, 0);
|
|
63
|
+
}
|
|
64
|
+
function group(segments, isGroup, getId, depth) {
|
|
65
|
+
return groupByAnnotation(segments, isGroup, getId, depth).map(
|
|
66
|
+
(group2) => group2.annotation ? createGroup(group2.annotation, group2.segments, isGroup, getId, depth) : createUngrouped(group2.segments)
|
|
67
|
+
);
|
|
68
|
+
}
|
|
69
|
+
function groupByAnnotation(segments, isGroup, getId, depth) {
|
|
70
|
+
const groups = [];
|
|
71
|
+
let currentId, currentGroup;
|
|
63
72
|
for (const segment2 of segments) {
|
|
64
|
-
const
|
|
65
|
-
|
|
66
|
-
let list = groups.get(match);
|
|
67
|
-
list || (list = [], groups.set(match, list)), list.push(segment2);
|
|
68
|
-
}
|
|
73
|
+
const annotation = getGroupAnnotation(segment2, isGroup, depth), id = annotation ? getId(annotation) : void 0;
|
|
74
|
+
!currentGroup || id !== currentId ? (currentGroup = annotation ? { annotation, segments: [segment2] } : { segments: [segment2] }, groups.push(currentGroup), currentId = id) : currentGroup.segments.push(segment2);
|
|
69
75
|
}
|
|
70
|
-
return
|
|
76
|
+
return groups;
|
|
77
|
+
}
|
|
78
|
+
function getGroupAnnotation(segment2, isGroup, depth) {
|
|
79
|
+
let groupIndex = 0;
|
|
80
|
+
for (const annotation of segment2.annotations)
|
|
81
|
+
if (isGroup(annotation)) {
|
|
82
|
+
if (groupIndex === depth)
|
|
83
|
+
return annotation;
|
|
84
|
+
groupIndex++;
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
function createGroup(annotation, segments, isGroup, getId, depth) {
|
|
88
|
+
return {
|
|
89
|
+
isGroup: !0,
|
|
71
90
|
annotation,
|
|
72
|
-
|
|
73
|
-
}
|
|
91
|
+
children: group(segments, isGroup, getId, depth + 1)
|
|
92
|
+
};
|
|
93
|
+
}
|
|
94
|
+
function createUngrouped(segments) {
|
|
95
|
+
return {
|
|
96
|
+
isGroup: !1,
|
|
97
|
+
segments
|
|
98
|
+
};
|
|
74
99
|
}
|
|
75
100
|
export {
|
|
101
|
+
findSegmentOffsets,
|
|
76
102
|
groupSegments,
|
|
77
|
-
mapAnnotationSegmentRanges,
|
|
78
103
|
segment
|
|
79
104
|
};
|
package/dist/segment.d.ts
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { TextSegment } from './Model';
|
|
2
|
+
import { GetOffsets } from './GetOffsets.ts';
|
|
2
3
|
/**
|
|
3
4
|
* Split a text into {@link TextSegment}s with character offsets and a list of applying annotations.
|
|
4
5
|
*/
|
|
5
|
-
export declare function segment<T
|
|
6
|
-
export type AnnotationSegmentsByChar<T
|
|
6
|
+
export declare function segment<T>(text: string, annotations: T[], getOffsets: GetOffsets<T>): TextSegment<T>[];
|
|
7
|
+
export type AnnotationSegmentsByChar<T> = {
|
|
7
8
|
charIndex: number;
|
|
8
9
|
starting: T[];
|
|
9
10
|
ending: T[];
|
package/package.json
CHANGED
|
package/dist/mapAnnotationSegmentRanges.d.ts
REMOVED
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
import { TextSegment } from './Model';
|
|
2
|
-
export type SegmentRange = {
|
|
3
|
-
startSegment: number;
|
|
4
|
-
/**
|
|
5
|
-
* Excluding last segment
|
|
6
|
-
*/
|
|
7
|
-
endSegment: number;
|
|
8
|
-
};
|
|
9
|
-
/**
|
|
10
|
-
* For each annotation, collect the first and last segment index it appears in, keyed by object reference.
|
|
11
|
-
*/
|
|
12
|
-
export declare function mapAnnotationSegmentRanges<T>(segments: TextSegment<T>[]): Map<T, SegmentRange>;
|