@knaw-huc/text-annotation-segmenter 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.html +41 -0
- package/README.md +56 -0
- package/dist/Model.d.ts +18 -0
- package/dist/groupSegments.d.ts +10 -0
- package/dist/index.d.ts +5 -0
- package/dist/index.js +71 -0
- package/dist/orThrow.d.ts +1 -0
- package/dist/segment.d.ts +10 -0
- package/package.json +35 -0
package/README.html
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
<h1 id="knaw-huctext-annotation-segmenter"><span class="citation" data-cites="knaw-huc/text-annotation-segmenter">@knaw-huc/text-annotation-segmenter</span></h1>
|
|
2
|
+
<p>Utility functions to help render overlapping annotations in a text.</p>
|
|
3
|
+
<p>Annotations on a text have a non-hierarchical nature, i.e., they can overlap:</p>
|
|
4
|
+
<pre class="text"><code>Aa<bc>bb<cd>cc</bc>dd</cd>ee.</code></pre>
|
|
5
|
+
<p>However, HTML is hierarchical. How to display these kinds of annotations that do not live inside or next to each other, but cut across each other?</p>
|
|
6
|
+
<p>The <code>segment</code> function creates an array of segments: a flat, non-overlapping list where each segment links to both the text and all the annotations that apply. Each segment translates into a single DOM element. Elements linked to multiple overlapping annotations can now be decorated with their own styling, classes and callbacks.</p>
|
|
7
|
+
<p>A special case is the marker: an annotation of zero width marking a position in the text. Markers result in zero-width segments, also linking all annotations that start at, end at, or span across that position.</p>
|
|
8
|
+
<h2 id="api">API</h2>
|
|
9
|
+
<ul>
|
|
10
|
+
<li><a href="./src/segment.ts"><code>segment<T>(text, annotations): TextSegment<T>[]</code></a> <br /> Split a text into TextSegments with char offsets to the text and a list of applying annotations.</li>
|
|
11
|
+
<li><a href="./src/Model.ts"><code>AnnotationSegment<T></code></a> <br /> Input list of objects linking annotations to the text using character offsets.</li>
|
|
12
|
+
<li><a href="./src/Model.ts"><code>TextSegment<T></code></a> <br /> Output list of segments with character offsets and the annotations that apply.</li>
|
|
13
|
+
<li><a href="./src/groupSegments.ts"><code>groupSegments<T>(segments, predicate): Group<T>[]</code></a> <br /> Group segments into higher-level units (e.g., words, sentences, entities) by collecting all segments that share a matching annotation.</li>
|
|
14
|
+
<li><a href="./src/groupSegments.ts"><code>Group<T></code></a> <br /> Output group of segments matching the same predicate result.</li>
|
|
15
|
+
</ul>
|
|
16
|
+
<h2 id="example">Example</h2>
|
|
17
|
+
<p>Given the text <code>"abc"</code> with two overlapping annotations:</p>
|
|
18
|
+
<div class="sourceCode" id="cb2"><pre class="sourceCode txt"><code class="sourceCode default"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true"></a> text: abc</span>
|
|
19
|
+
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true"></a>annotation ab: __</span>
|
|
20
|
+
<span id="cb2-3"><a href="#cb2-3" aria-hidden="true"></a>annotation bc: __</span></code></pre></div>
|
|
21
|
+
<div class="sourceCode" id="cb3"><pre class="sourceCode ts"><code class="sourceCode typescript"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true"></a><span class="im">import</span> { segment } from <span class="st">"@knaw-huc/text-annotation-segmenter"</span><span class="op">;</span></span>
|
|
22
|
+
<span id="cb3-2"><a href="#cb3-2" aria-hidden="true"></a></span>
|
|
23
|
+
<span id="cb3-3"><a href="#cb3-3" aria-hidden="true"></a>const text <span class="op">=</span> <span class="st">'abc'</span><span class="op">;</span></span>
|
|
24
|
+
<span id="cb3-4"><a href="#cb3-4" aria-hidden="true"></a>const ab <span class="op">=</span> {id<span class="op">:</span> <span class="st">'ab'</span>}<span class="op">;</span></span>
|
|
25
|
+
<span id="cb3-5"><a href="#cb3-5" aria-hidden="true"></a>const bc <span class="op">=</span> {id<span class="op">:</span> <span class="st">'bc'</span>}<span class="op">;</span></span>
|
|
26
|
+
<span id="cb3-6"><a href="#cb3-6" aria-hidden="true"></a></span>
|
|
27
|
+
<span id="cb3-7"><a href="#cb3-7" aria-hidden="true"></a>const segments <span class="op">=</span> <span class="fu">segment</span>(text<span class="op">,</span> [</span>
|
|
28
|
+
<span id="cb3-8"><a href="#cb3-8" aria-hidden="true"></a> {begin<span class="op">:</span> <span class="dv">0</span><span class="op">,</span> end<span class="op">:</span> <span class="dv">2</span><span class="op">,</span> body<span class="op">:</span> ab}<span class="op">,</span></span>
|
|
29
|
+
<span id="cb3-9"><a href="#cb3-9" aria-hidden="true"></a> {begin<span class="op">:</span> <span class="dv">1</span><span class="op">,</span> end<span class="op">:</span> <span class="dv">3</span><span class="op">,</span> body<span class="op">:</span> bc}<span class="op">,</span></span>
|
|
30
|
+
<span id="cb3-10"><a href="#cb3-10" aria-hidden="true"></a>])<span class="op">;</span></span>
|
|
31
|
+
<span id="cb3-11"><a href="#cb3-11" aria-hidden="true"></a></span>
|
|
32
|
+
<span id="cb3-12"><a href="#cb3-12" aria-hidden="true"></a><span class="fu">expect</span>(segments)<span class="op">.</span><span class="fu">toEqual</span>([</span>
|
|
33
|
+
<span id="cb3-13"><a href="#cb3-13" aria-hidden="true"></a> {id<span class="op">:</span> <span class="st">'0'</span><span class="op">,</span> begin<span class="op">:</span> <span class="dv">0</span><span class="op">,</span> end<span class="op">:</span> <span class="dv">1</span><span class="op">,</span> annotations<span class="op">:</span> [ab]}<span class="op">,</span></span>
|
|
34
|
+
<span id="cb3-14"><a href="#cb3-14" aria-hidden="true"></a> {id<span class="op">:</span> <span class="st">'1'</span><span class="op">,</span> begin<span class="op">:</span> <span class="dv">1</span><span class="op">,</span> end<span class="op">:</span> <span class="dv">2</span><span class="op">,</span> annotations<span class="op">:</span> [ab<span class="op">,</span> bc]}<span class="op">,</span></span>
|
|
35
|
+
<span id="cb3-15"><a href="#cb3-15" aria-hidden="true"></a> {id<span class="op">:</span> <span class="st">'2'</span><span class="op">,</span> begin<span class="op">:</span> <span class="dv">2</span><span class="op">,</span> end<span class="op">:</span> <span class="dv">3</span><span class="op">,</span> annotations<span class="op">:</span> [bc]}<span class="op">,</span></span>
|
|
36
|
+
<span id="cb3-16"><a href="#cb3-16" aria-hidden="true"></a>])<span class="op">;</span></span></code></pre></div>
|
|
37
|
+
<p>More examples:</p>
|
|
38
|
+
<ul>
|
|
39
|
+
<li>For edge cases, see: <a href="./src/segment.spec.ts">segment.spec.ts</a>.</li>
|
|
40
|
+
<li>For benchmarks, see <a href="./src/segment.bench.ts">segment.bench.ts</a>.</li>
|
|
41
|
+
</ul>
|
package/README.md
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# @knaw-huc/text-annotation-segmenter
|
|
2
|
+
|
|
3
|
+
Utility functions to help render overlapping annotations in a text.
|
|
4
|
+
|
|
5
|
+
Annotations on a text have a non-hierarchical nature, i.e., they can overlap:
|
|
6
|
+
```text
|
|
7
|
+
Aa<bc>bb<cd>cc</bc>dd</cd>ee.
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
However, HTML is hierarchical. How to display these kinds of annotations that do not live inside or next to each other, but cut across each other?
|
|
11
|
+
|
|
12
|
+
The `segment` function creates an array of segments: a flat, non-overlapping list where each segment links to both the text and all the annotations that apply. Each segment translates into a single DOM element. Elements linked to multiple overlapping annotations can now be decorated with their own styling, classes and callbacks.
|
|
13
|
+
|
|
14
|
+
A special case is the marker: an annotation of zero width marking a position in the text. Markers result in zero-width segments, also linking all annotations that start at, end at, or span across that position.
|
|
15
|
+
|
|
16
|
+
## API
|
|
17
|
+
|
|
18
|
+
- [`segment<T>(text, annotations): TextSegment<T>[]`](./src/segment.ts) <br /> Split a text into TextSegments with char offsets to the text and a list of applying annotations.
|
|
19
|
+
- [`AnnotationSegment<T>`](./src/Model.ts) <br /> Input list of objects linking annotations to the text using character offsets.
|
|
20
|
+
- [`TextSegment<T>`](./src/Model.ts) <br /> Output list of segments with character offsets and the annotations that apply.
|
|
21
|
+
- [`groupSegments<T>(segments, predicate): Group<T>[]`](./src/groupSegments.ts) <br /> Group segments into higher-level units (e.g., words, sentences, entities) by collecting all segments that share a matching annotation.
|
|
22
|
+
- [`Group<T>`](./src/groupSegments.ts) <br /> Output group of segments matching the same predicate result.
|
|
23
|
+
|
|
24
|
+
## Example
|
|
25
|
+
|
|
26
|
+
Given the text `"abc"` with two overlapping annotations:
|
|
27
|
+
|
|
28
|
+
```txt
|
|
29
|
+
text: abc
|
|
30
|
+
annotation ab: __
|
|
31
|
+
annotation bc: __
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
```ts
|
|
35
|
+
import { segment } from "@knaw-huc/text-annotation-segmenter";
|
|
36
|
+
|
|
37
|
+
const text = 'abc';
|
|
38
|
+
const ab = {id: 'ab'};
|
|
39
|
+
const bc = {id: 'bc'};
|
|
40
|
+
|
|
41
|
+
const segments = segment(text, [
|
|
42
|
+
{begin: 0, end: 2, body: ab},
|
|
43
|
+
{begin: 1, end: 3, body: bc},
|
|
44
|
+
]);
|
|
45
|
+
|
|
46
|
+
expect(segments).toEqual([
|
|
47
|
+
{id: '0', begin: 0, end: 1, annotations: [ab]},
|
|
48
|
+
{id: '1', begin: 1, end: 2, annotations: [ab, bc]},
|
|
49
|
+
{id: '2', begin: 2, end: 3, annotations: [bc]},
|
|
50
|
+
]);
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
More examples:
|
|
54
|
+
|
|
55
|
+
- For edge cases, see: [segment.spec.ts](./src/segment.spec.ts).
|
|
56
|
+
- For benchmarks, see [segment.bench.ts](./src/segment.bench.ts).
|
package/dist/Model.d.ts
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* A single input item that attaches one annotation body to the text by its begin and end character offsets; `segment` takes an array of these.
|
|
3
|
+
*/
|
|
4
|
+
export type AnnotationSegment<T = unknown> = {
|
|
5
|
+
begin: number;
|
|
6
|
+
end: number;
|
|
7
|
+
body: T;
|
|
8
|
+
};
|
|
9
|
+
/**
|
|
10
|
+
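* A single output segment: a stretch of text given by its begin and end character offsets, together with the bodies of all annotations that apply to it; `segment` returns an array of these.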
|
|
11
|
+
*/
|
|
12
|
+
export type TextSegment<T = unknown> = {
|
|
13
|
+
id: SegmentId;
|
|
14
|
+
begin: number;
|
|
15
|
+
end: number;
|
|
16
|
+
annotations: T[];
|
|
17
|
+
};
|
|
18
|
+
export type SegmentId = string;
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import { TextSegment } from './Model.ts';
|
|
2
|
+
export type Group<T> = {
|
|
3
|
+
annotation: T;
|
|
4
|
+
segments: TextSegment<T>[];
|
|
5
|
+
};
|
|
6
|
+
/**
|
|
7
|
+
* Group segments into higher-level units (e.g. words, sentences, entities)
|
|
8
|
+
* by grouping all segments that share a matching annotation.
|
|
9
|
+
*/
|
|
10
|
+
export declare function groupSegments<T>(segments: TextSegment<T>[], predicate: (annotation: T) => boolean): Group<T>[];
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
export { segment } from './segment.ts';
|
|
2
|
+
export type { AnnotationSegmentsByChar } from './segment.ts';
|
|
3
|
+
export { groupSegments } from './groupSegments.ts';
|
|
4
|
+
export type { Group } from './groupSegments.ts';
|
|
5
|
+
export type { AnnotationSegment, TextSegment, SegmentId, } from './Model.ts';
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/**
 * Split a text into a flat, ordered list of non-overlapping segments, each
 * carrying the bodies of all annotations that cover it.
 *
 * Behavior:
 * - Segment ids are a running counter rendered as a string ('0', '1', ...).
 * - Annotations with `begin >= text.length` or `end <= 0` are skipped. This
 *   also skips zero-width annotations located exactly at 0 or at text.length.
 * - Annotations with `begin > end` are skipped; they could never be closed
 *   and would otherwise stay active until the end of the text.
 * - Annotations that partially stick out of the text are clamped to
 *   [0, text.length].
 * - A zero-width annotation (marker) produces a zero-width segment whose
 *   annotations are the marker bodies at that position followed by the bodies
 *   that are still active after the annotations ending there were removed.
 * - A body is listed at most once per regular segment and stays listed for as
 *   long as at least one annotation carrying it is open, so the same body may
 *   safely be used by adjacent or overlapping annotations.
 *
 * @param {string} text Text the offsets refer to; only its length is used.
 * @param {{begin: number, end: number, body: *}[]} annotations Annotation
 *   ranges; `end` is exclusive.
 * @returns {{id: string, begin: number, end: number, annotations: Array}[]}
 *   Segments in text order.
 */
function segment(text, annotations) {
  const segments = [];
  let segmentCounter = 0;

  // One boundary record per distinct character index at which an annotation
  // starts or ends.
  const offsetMap = new Map();
  const getOrCreateOffset = (charIndex) => {
    let offset = offsetMap.get(charIndex);
    if (!offset) {
      offset = { charIndex, starting: [], ending: [] };
      offsetMap.set(charIndex, offset);
    }
    return offset;
  };

  // The text boundaries always exist, so text without annotations is still
  // returned as a segment.
  getOrCreateOffset(0);
  getOrCreateOffset(text.length);

  for (const annotation of annotations) {
    // Entirely outside the text.
    if (annotation.begin >= text.length || annotation.end <= 0) {
      continue;
    }
    // Inverted range: its end boundary precedes its start boundary.
    if (annotation.begin > annotation.end) {
      continue;
    }
    const needsClamping = annotation.begin < 0 || annotation.end > text.length;
    const clamped = needsClamping
      ? {
          ...annotation,
          begin: Math.max(0, annotation.begin),
          end: Math.min(text.length, annotation.end)
        }
      : annotation;
    getOrCreateOffset(clamped.begin).starting.push(clamped);
    getOrCreateOffset(clamped.end).ending.push(clamped);
  }

  const sortedOffsets = [...offsetMap.values()].sort(
    (a, b) => a.charIndex - b.charIndex
  );

  // body -> number of currently open annotations carrying that body.
  // Counting (instead of a plain Set of bodies) keeps a body active when one
  // of several annotations sharing it ends. Map keys iterate in insertion
  // order, which gives the order of bodies within a segment.
  const activeCounts = new Map();
  let lastOffset = 0;

  for (const offset of sortedOffsets) {
    if (offset.charIndex > lastOffset) {
      segments.push({
        id: `${segmentCounter++}`,
        begin: lastOffset,
        end: offset.charIndex,
        annotations: [...activeCounts.keys()]
      });
    }
    lastOffset = offset.charIndex;

    // Every annotation in `starting` begins at this offset, so it is a marker
    // exactly when its end equals its begin.
    const markerBodies = [];
    for (const a of offset.starting) {
      if (a.begin === a.end) {
        markerBodies.push(a.body);
      } else {
        activeCounts.set(a.body, (activeCounts.get(a.body) || 0) + 1);
      }
    }

    for (const a of offset.ending) {
      // Markers were never counted as active, so they must not close anything.
      if (a.begin === a.end) {
        continue;
      }
      const remaining = (activeCounts.get(a.body) || 0) - 1;
      if (remaining > 0) {
        activeCounts.set(a.body, remaining);
      } else {
        activeCounts.delete(a.body);
      }
    }

    if (markerBodies.length > 0) {
      // NOTE(review): annotations ending at this offset have already been
      // removed above, so they are not listed on the marker segment, although
      // the README says markers also link annotations that end at the
      // position. Confirm which behavior is intended.
      const alreadyListed = new Set(markerBodies);
      for (const body of activeCounts.keys()) {
        if (!alreadyListed.has(body)) {
          markerBodies.push(body);
        }
      }
      segments.push({
        id: `${segmentCounter++}`,
        begin: offset.charIndex,
        end: offset.charIndex,
        annotations: markerBodies
      });
    }
  }

  return segments;
}
|
|
54
|
+
function groupSegments(segments, predicate) {
|
|
55
|
+
const groups = /* @__PURE__ */ new Map();
|
|
56
|
+
for (const segment2 of segments) {
|
|
57
|
+
const match = segment2.annotations.find(predicate);
|
|
58
|
+
if (match) {
|
|
59
|
+
let list = groups.get(match);
|
|
60
|
+
list || (list = [], groups.set(match, list)), list.push(segment2);
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
return [...groups.entries()].map(([annotation, segments2]) => ({
|
|
64
|
+
annotation,
|
|
65
|
+
segments: segments2
|
|
66
|
+
}));
|
|
67
|
+
}
|
|
68
|
+
export {
|
|
69
|
+
groupSegments,
|
|
70
|
+
segment
|
|
71
|
+
};
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export declare function orThrow(msg: string): never;
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import { AnnotationSegment, TextSegment } from './Model';
|
|
2
|
+
/**
|
|
3
|
+
* Split a text into {@link TextSegment}s with character offsets and a list of applying annotations.
|
|
4
|
+
*/
|
|
5
|
+
export declare function segment<T>(text: string, annotations: AnnotationSegment<T>[]): TextSegment<T>[];
|
|
6
|
+
export type AnnotationSegmentsByChar<T = unknown> = {
|
|
7
|
+
charIndex: number;
|
|
8
|
+
starting: AnnotationSegment<T>[];
|
|
9
|
+
ending: AnnotationSegment<T>[];
|
|
10
|
+
};
|
package/package.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@knaw-huc/text-annotation-segmenter",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"repository": {
|
|
5
|
+
"type": "git",
|
|
6
|
+
"url": "https://github.com/knaw-huc/text-annotation-segmenter.git"
|
|
7
|
+
},
|
|
8
|
+
"type": "module",
|
|
9
|
+
"main": "dist/index.js",
|
|
10
|
+
"types": "dist/index.d.ts",
|
|
11
|
+
"exports": {
|
|
12
|
+
".": {
|
|
13
|
+
"import": "./dist/index.js",
|
|
14
|
+
"types": "./dist/index.d.ts"
|
|
15
|
+
}
|
|
16
|
+
},
|
|
17
|
+
"files": [
|
|
18
|
+
"dist"
|
|
19
|
+
],
|
|
20
|
+
"scripts": {
|
|
21
|
+
"dev:build": "vite build --watch",
|
|
22
|
+
"build": "vite build",
|
|
23
|
+
"test": "vitest run",
|
|
24
|
+
"bench": "vitest bench",
|
|
25
|
+
"prepublishOnly": "npm run build"
|
|
26
|
+
},
|
|
27
|
+
"devDependencies": {
|
|
28
|
+
"@types/node": "^25.2.3",
|
|
29
|
+
"typedoc": "^0.28.17",
|
|
30
|
+
"typescript": "^5.9.2",
|
|
31
|
+
"vite": "^7.0.5",
|
|
32
|
+
"vite-plugin-dts": "^4.5.4",
|
|
33
|
+
"vitest": "^4.0.16"
|
|
34
|
+
}
|
|
35
|
+
}
|