@knaw-huc/text-annotation-segmenter 0.5.1 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  [![npm version](https://img.shields.io/npm/v/@knaw-huc/text-annotation-segmenter.svg?color=green)](https://www.npmjs.com/package/@knaw-huc/text-annotation-segmenter)
4
4
 
5
- Utility functions to render annotations with character offsets in a text.
5
+ Utility functions to render annotations with character position offsets in a text.
6
6
 
7
7
  Annotations on a text have a non-hierarchical nature, i.e., they can overlap:
8
8
  ```text
@@ -24,16 +24,16 @@ The `segment` function creates an array of segments: a flat, non-overlapping lis
24
24
 
25
25
  ### Functions
26
26
 
27
- - [`segment<T>(text, annotations, getOffsets, getMarkerPosition?): TextSegment<T>[]`](./src/segment.ts) <br /> Split a text into TextSegments with character offsets to the text and a list of applying annotations.
27
+ - [`segment<T>(text, annotations, getOffsets, getMarkerPosition?): TextSegment<T>[]`](./src/segment.ts) <br /> Split a text into TextSegments with character positions and a list of applying annotations.
28
28
  - [`groupSegments<T>(segments, isGroup, getId): SegmentGroup<T>[]`](src/groupSegments.ts) <br /> Recursively group segments into higher-level units (e.g., words, paragraphs, divs) by collecting all segments that share a matching annotation.
29
29
  - [`collectGroupSegments<T>(group): TextSegment<T>[]`](src/groupSegments.ts) <br /> Recursively collect all TextSegments from a Group.
30
- - [`findSegmentOffsets<T>(segments): Map<T, SegmentOffsets>`](src/findSegmentOffsets.ts) <br /> For each annotation, collect the first and last segment index it appears in, keyed by object reference.
30
+ - [`findSegmentRange<T>(segments): Map<T, SegmentRange>`](src/findSegmentRange.ts) <br /> For each annotation, collect the index of the first segment it appears in and the index just past the last one (end-exclusive), keyed by object reference.
31
31
 
32
32
  ### Types
33
33
 
34
34
  - [`SegmentGroup<T>`](src/groupSegments.ts) <br /> A `Group<T>` (with annotation and recursive children) or `Ungrouped<T>` (with plain segments).
35
- - [`SegmentOffsets`](src/findSegmentOffsets.ts) <br /> Start and end segment index of an annotation (excluding end segment).
36
- - [`TextSegment<T>`](./src/Model.ts) <br /> Start and end offsets of text segment (excluding last character), plus the annotations that apply.
35
+ - [`SegmentRange`](src/findSegmentRange.ts) <br /> Start and end segment index of an annotation (excluding last segment).
36
+ - [`TextSegment<T>`](./src/Model.ts) <br /> Start and end positions of text segment (excluding last character), plus the annotations that apply.
37
37
 
38
38
  ## Examples
39
39
 
@@ -51,17 +51,17 @@ annotation bc: __
51
51
  import {segment} from "@knaw-huc/text-annotation-segmenter";
52
52
 
53
53
  const text = 'abc';
54
- const ab = {id: 'ab', begin: 0, end: 2};
55
- const bc = {id: 'bc', begin: 1, end: 3};
54
+ const ab = {id: 'ab', start: 0, end: 2};
55
+ const bc = {id: 'bc', start: 1, end: 3};
56
56
 
57
57
  const getOffsets = annotation => annotation;
58
58
 
59
59
  const segments = segment(text, [ab, bc], getOffsets);
60
60
 
61
61
  expect(segments).toEqual([
62
- {index: 0, begin: 0, end: 1, body: 'a', annotations: [ab]},
63
- {index: 1, begin: 1, end: 2, body: 'b', annotations: [ab, bc]},
64
- {index: 2, begin: 2, end: 3, body: 'c', annotations: [bc]},
62
+ {index: 0, start: 0, end: 1, value: 'a', annotations: [ab]},
63
+ {index: 1, start: 1, end: 2, value: 'b', annotations: [ab, bc]},
64
+ {index: 2, start: 2, end: 3, value: 'c', annotations: [bc]},
65
65
  ]);
66
66
  ```
67
67
  - More examples: [segment.spec.ts](./src/segment.spec.ts).
@@ -83,8 +83,8 @@ paragraph: __
83
83
  import {segment, groupSegments} from "@knaw-huc/text-annotation-segmenter";
84
84
 
85
85
  const text = 'aabb';
86
- const section = {id: 'section', type: 'section', begin: 0, end: 4};
87
- const paragraph = {id: 'paragraph', type: 'paragraph', begin: 0, end: 2};
86
+ const section = {id: 'section', type: 'section', start: 0, end: 4};
87
+ const paragraph = {id: 'paragraph', type: 'paragraph', start: 0, end: 2};
88
88
 
89
89
  const getOffsets = annotation => annotation;
90
90
  const segments = segment(text, [section, paragraph], getOffsets);
@@ -125,19 +125,19 @@ annotation b: _
125
125
 
126
126
  ```ts
127
127
  const text = 'ab';
128
- const marker: Annotation = {begin: 1, end: 1, id: 'm'};
129
- const a: Annotation = {begin: 0, end: 1, id: 'a'};
130
- const b: Annotation = {begin: 1, end: 2, id: 'b'};
128
+ const marker: Annotation = {start: 1, end: 1, id: 'm'};
129
+ const a: Annotation = {start: 0, end: 1, id: 'a'};
130
+ const b: Annotation = {start: 1, end: 2, id: 'b'};
131
131
 
132
132
  const segments = segment(text, [marker, a, b], getOffsets);
133
133
  const markerSegment = segments[1];
134
134
  expect(markerSegment).toEqual(
135
- {index: 1, begin: 1, end: 1, body: '', annotations: [marker, a]},
135
+ {index: 1, start: 1, end: 1, value: '', annotations: [marker, a]},
136
136
  );
137
137
  ```
138
- Notice how the marker is 'postfixed' to the previous annotation. When the marker shares its character index with the start and end offsets of annotations, by default the marker segment will only include the annotations that end at that offset (e.g: a note at the end of a paragraph).
138
+ Notice how the marker is 'postfixed' to the previous annotation. When the marker shares its character index with the start and end positions of annotations, by default the marker segment will only include the annotations that end at that position (e.g., a note at the end of a paragraph).
139
139
 
140
- However, markers can also be prefixed to annotations that share their start offset with the marker, using the `getMarkerPosition` parameter of the `segment` function:
140
+ However, markers can also be prefixed to annotations that share their start position with the marker, using the `getMarkerPosition` parameter of the `segment` function:
141
141
 
142
142
  ```ts
143
143
  const getMarkerPosition = a => a.id === 'm' ? 'prefix' : 'postfix'
@@ -145,7 +145,7 @@ const segments = segment(text, [marker, a, b], getOffsets, getMarkerPosition);
145
145
 
146
146
  const markerSegment = segments[1];
147
147
  expect(markerSegment).toEqual(
148
- {index: 1, begin: 1, end: 1, body: '', annotations: [marker, b]},
148
+ {index: 1, start: 1, end: 1, value: '', annotations: [marker, b]},
149
149
  );
150
150
  ```
151
151
  More examples: [segment.spec.ts](./src/segment.spec.ts#L118).
@@ -1,2 +1,2 @@
1
- import { Offsets } from './Model.ts';
2
- export type GetOffsets<T> = (annotation: T) => Offsets;
1
+ import { TextPosition } from './Model.ts';
2
+ export type GetOffsets<T> = (annotation: T) => TextPosition;
package/dist/Model.d.ts CHANGED
@@ -1,17 +1,24 @@
1
1
  /**
2
2
  * Start (inclusive) and end (exclusive) character positions
3
3
  */
4
- export type Offsets = {
5
- begin: number;
4
+ export type TextPosition = {
5
+ start: number;
6
6
  end: number;
7
7
  };
8
8
  /**
9
- * Output: start and end offsets of text segment (excluding last character), plus the annotations that apply.
9
+ * Output: the character range and sliced text of a text segment, plus all annotations that apply.
10
10
  */
11
- export type TextSegment<T> = Offsets & {
11
+ export type TextSegment<T> = TextPosition & {
12
12
  index: SegmentIndex;
13
- body: string;
13
+ value: string;
14
14
  annotations: T[];
15
15
  };
16
16
  export type SegmentIndex = number;
17
17
  export type MarkerPosition = 'prefix' | 'postfix';
18
+ /**
19
+ * Start and end segment index of an annotation, excluding last segment
20
+ */
21
+ export type SegmentRange = {
22
+ startSegment: number;
23
+ endSegment: number;
24
+ };
@@ -0,0 +1,6 @@
1
+ import { SegmentRange, TextSegment } from './Model';
2
+ /**
3
+ * For each annotation, collect the index of the first segment it appears in and the index just past the last one (end-exclusive),
4
+ * keyed by object reference.
5
+ */
6
+ export declare function findSegmentRange<T>(segments: TextSegment<T>[]): Map<T, SegmentRange>;
package/dist/index.d.ts CHANGED
@@ -1,10 +1,9 @@
1
1
  export { segment } from './segment.ts';
2
2
  export type { AnnotationSegmentsByChar } from './segment.ts';
3
- export { findSegmentOffsets } from './findSegmentOffsets.ts';
4
- export type { SegmentOffsets } from './findSegmentOffsets.ts';
3
+ export { findSegmentRange } from './findSegmentRange.ts';
5
4
  export { groupSegments } from './groupSegments.ts';
6
5
  export { collectGroupSegments } from './collectGroupSegments.ts';
7
6
  export type { Group, SegmentGroup } from './groupSegments.ts';
8
- export type { Offsets, TextSegment, SegmentIndex, MarkerPosition, } from './Model.ts';
7
+ export type { TextPosition, TextSegment, SegmentIndex, MarkerPosition, SegmentRange } from './Model.ts';
9
8
  export type { GetOffsets } from './GetOffsets.ts';
10
9
  export type { GetMarkerPosition } from './GetMarkerPosition.ts';
package/dist/index.js CHANGED
@@ -6,17 +6,17 @@ function segment(text, annotations, getOffsets, getMarkerPosition = () => "postf
6
6
  return offset || (offset = { charIndex, starting: [], ending: [] }, offsetMap.set(charIndex, offset)), offset;
7
7
  }, createMarkerSegment = (markers, charIndex) => ({
8
8
  index: ++segmentCounter,
9
- begin: charIndex,
9
+ start: charIndex,
10
10
  end: charIndex,
11
- body: "",
11
+ value: "",
12
12
  annotations: [...markers, ...activeAnnotations]
13
13
  });
14
14
  getOrCreateOffset(0), getOrCreateOffset(text.length);
15
15
  for (const annotation of annotations) {
16
- const offsets = getOffsets(annotation), isMarker = offsets.begin === offsets.end;
17
- if (isMarker && (offsets.end < 0 || offsets.begin > text.length) || !isMarker && (offsets.end <= 0 || offsets.begin >= text.length))
16
+ const offsets = getOffsets(annotation), isMarker = offsets.start === offsets.end;
17
+ if (isMarker && (offsets.end < 0 || offsets.start > text.length) || !isMarker && (offsets.end <= 0 || offsets.start >= text.length))
18
18
  continue;
19
- const begin = Math.max(0, offsets.begin), end = Math.min(text.length, offsets.end);
19
+ const begin = Math.max(0, offsets.start), end = Math.min(text.length, offsets.end);
20
20
  getOrCreateOffset(begin).starting.push(annotation), getOrCreateOffset(end).ending.push(annotation);
21
21
  }
22
22
  const sortedOffsets = [...offsetMap.values()].sort(
@@ -28,9 +28,9 @@ function segment(text, annotations, getOffsets, getMarkerPosition = () => "postf
28
28
  const index = ++segmentCounter;
29
29
  segments.push({
30
30
  index,
31
- begin: lastOffset,
31
+ start: lastOffset,
32
32
  end: offset.charIndex,
33
- body: text.slice(lastOffset, offset.charIndex),
33
+ value: text.slice(lastOffset, offset.charIndex),
34
34
  annotations: [...activeAnnotations]
35
35
  });
36
36
  }
@@ -47,12 +47,12 @@ function segment(text, annotations, getOffsets, getMarkerPosition = () => "postf
47
47
  }
48
48
  return segments;
49
49
  }
50
- function findSegmentOffsets(segments) {
50
+ function findSegmentRange(segments) {
51
51
  const offsets = /* @__PURE__ */ new Map();
52
52
  for (let i = 0; i < segments.length; i++)
53
53
  for (const annotation of segments[i].annotations) {
54
54
  const existing = offsets.get(annotation);
55
- existing ? existing.endSegment = i + 1 : offsets.set(annotation, { beginSegment: i, endSegment: i + 1 });
55
+ existing ? existing.endSegment = i + 1 : offsets.set(annotation, { startSegment: i, endSegment: i + 1 });
56
56
  }
57
57
  return offsets;
58
58
  }
@@ -102,7 +102,7 @@ function collectGroupSegments(group2) {
102
102
  }
103
103
  export {
104
104
  collectGroupSegments,
105
- findSegmentOffsets,
105
+ findSegmentRange,
106
106
  groupSegments,
107
107
  segment
108
108
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@knaw-huc/text-annotation-segmenter",
3
- "version": "0.5.1",
3
+ "version": "0.6.0",
4
4
  "repository": {
5
5
  "type": "git",
6
6
  "url": "https://github.com/knaw-huc/text-annotation-segmenter.git"
@@ -1,13 +0,0 @@
1
- import { TextSegment } from './Model';
2
- export type SegmentOffsets = {
3
- beginSegment: number;
4
- /**
5
- * Excluding last segment
6
- */
7
- endSegment: number;
8
- };
9
- /**
10
- * For each annotation, collect the first and last segment index
11
- * it appears in, keyed by object reference.
12
- */
13
- export declare function findSegmentOffsets<T>(segments: TextSegment<T>[]): Map<T, SegmentOffsets>;