behave-text 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- behave_text/__init__.py +0 -0
- behave_text/spec/__init__.py +43 -0
- behave_text/spec/envelope.py +53 -0
- behave_text/spec/primitives.py +353 -0
- behave_text-0.1.0.dist-info/METADATA +14 -0
- behave_text-0.1.0.dist-info/RECORD +8 -0
- behave_text-0.1.0.dist-info/WHEEL +5 -0
- behave_text-0.1.0.dist-info/top_level.txt +1 -0
behave_text/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
2
|
+
"""BEHAVE-TEXT spec — text/messaging-domain registry, layered on behave-core.
|
|
3
|
+
|
|
4
|
+
Public API:
|
|
5
|
+
|
|
6
|
+
from behave_text.spec import Observation, Window, OBSERVATION_SCHEMA_VERSION
|
|
7
|
+
from behave_text.spec import PRIMITIVE_REGISTRY, ValueKind, ValueTypeSpec
|
|
8
|
+
from behave_text.spec import TOPIC_PREFIX, event_topic_for
|
|
9
|
+
|
|
10
|
+
The ``Observation`` exported here is a registry-aware subclass of the base
|
|
11
|
+
class from ``behave-core``; it validates that ``primitive`` is in the
|
|
12
|
+
text registry and that ``value`` matches the registry's per-primitive spec.
|
|
13
|
+
|
|
14
|
+
See ``spec.envelope`` (and the core envelope module) for PII discipline.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from .envelope import OBSERVATION_SCHEMA_VERSION, Observation, ObservationValue, Window
|
|
18
|
+
from .primitives import PRIMITIVE_REGISTRY, ValueKind, ValueTypeSpec, get, is_known
|
|
19
|
+
|
|
20
|
+
# Topic namespace deliberately uses *actor* (not *attacker*) because chat-group
|
|
21
|
+
# members may include observers, brokers, victims, and bystanders alongside
|
|
22
|
+
# threat actors. Attribution of role is the engine's job, not BEHAVE-TEXT's.
|
|
23
|
+
TOPIC_PREFIX: str = "actor.observation.text"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def event_topic_for(primitive: str) -> str:
    """Build the bus topic name for *primitive*.

    The topic is ``TOPIC_PREFIX`` and the primitive name joined by a single
    dot, e.g. ``"<TOPIC_PREFIX>.lexical.slang_density"``.
    """
    topic = "{}.{}".format(TOPIC_PREFIX, primitive)
    return topic
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
__all__ = [
|
|
32
|
+
"OBSERVATION_SCHEMA_VERSION",
|
|
33
|
+
"Observation",
|
|
34
|
+
"ObservationValue",
|
|
35
|
+
"Window",
|
|
36
|
+
"PRIMITIVE_REGISTRY",
|
|
37
|
+
"ValueKind",
|
|
38
|
+
"ValueTypeSpec",
|
|
39
|
+
"is_known",
|
|
40
|
+
"get",
|
|
41
|
+
"TOPIC_PREFIX",
|
|
42
|
+
"event_topic_for",
|
|
43
|
+
]
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
2
|
+
"""BEHAVE-TEXT Observation envelope (registry-aware subclass).
|
|
3
|
+
|
|
4
|
+
Mirrors BEHAVE-SHELL's pattern: structural envelope from `behave-core`,
|
|
5
|
+
registry-aware validation added here against BEHAVE-TEXT's `PRIMITIVE_REGISTRY`.
|
|
6
|
+
|
|
7
|
+
PII discipline (TIGHTER for text than for shell):
|
|
8
|
+
text-domain sensors operate on raw message bodies. They MUST hash, aggregate,
|
|
9
|
+
or categorize before constructing an Observation — never put message text
|
|
10
|
+
into the `value` or `evidence_ref` field. `evidence_ref` should point at an
|
|
11
|
+
external message-store record (e.g. a Telegram message ID), not at the text.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from pydantic import model_validator
|
|
17
|
+
|
|
18
|
+
from behave_core.spec.envelope import (
|
|
19
|
+
OBSERVATION_SCHEMA_VERSION,
|
|
20
|
+
ObservationValue,
|
|
21
|
+
Window,
|
|
22
|
+
)
|
|
23
|
+
from behave_core.spec.envelope import Observation as _BaseObservation
|
|
24
|
+
|
|
25
|
+
from .primitives import PRIMITIVE_REGISTRY
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class Observation(_BaseObservation):
    """Base observation envelope plus a check against the text registry.

    On top of whatever the base class validates, the ``primitive`` must be a
    key of ``PRIMITIVE_REGISTRY`` and ``value`` must satisfy that entry's
    ``ValueTypeSpec``.
    """

    @model_validator(mode="after")
    def _validate_against_text_registry(self) -> "Observation":
        """Reject unknown primitives and values that violate the registry spec.

        Raises:
            ValueError: if ``self.primitive`` is not registered, or if the
                registered spec rejects ``self.value``.
        """
        primitive_name = self.primitive
        value_spec = PRIMITIVE_REGISTRY.get(primitive_name)

        if value_spec is None:
            message = (
                f"unknown primitive {primitive_name!r}; "
                f"add it to spec/primitives.py:PRIMITIVE_REGISTRY first"
            )
            raise ValueError(message)

        try:
            value_spec.validate_value(self.value)
        except ValueError as exc:
            # Re-raise with the primitive name for context; the original
            # traceback is intentionally suppressed with ``from None``.
            message = f"value invalid for primitive {primitive_name!r}: {exc}"
            raise ValueError(message) from None

        return self
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
__all__ = [
|
|
49
|
+
"OBSERVATION_SCHEMA_VERSION",
|
|
50
|
+
"Observation",
|
|
51
|
+
"ObservationValue",
|
|
52
|
+
"Window",
|
|
53
|
+
]
|
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
2
|
+
"""BEHAVE-TEXT primitive registry.
|
|
3
|
+
|
|
4
|
+
Source-of-truth for what `Observation.primitive` may be in the text/messaging
|
|
5
|
+
domain and what `Observation.value` must look like. Mirrors every row in the
|
|
6
|
+
primitive tables of `scratchpad.md`.
|
|
7
|
+
|
|
8
|
+
PII discipline notice (carried over from behave-core's envelope module):
|
|
9
|
+
TEXT-domain observations carry CATEGORICAL LABELS, AGGREGATE RATES, and
|
|
10
|
+
HASHES of distributions. Sensors operating on Telegram/messaging text MUST
|
|
11
|
+
NOT emit raw message content into BEHAVE-TEXT observations — only derived
|
|
12
|
+
features. The `evidence_ref` field points to the underlying message store
|
|
13
|
+
held elsewhere; never into the message body itself.
|
|
14
|
+
|
|
15
|
+
This is a tighter constraint than BEHAVE-SHELL's because the source signal
|
|
16
|
+
IS text content. Sensors must hash/aggregate before emitting.
|
|
17
|
+
|
|
18
|
+
Adding a new primitive is a deliberate registry edit. Drift between this file
|
|
19
|
+
and `scratchpad.md` is a bug; v0 keeps the registry hand-written so PR review
|
|
20
|
+
catches drift, v0.x may auto-extract from the markdown if drift becomes a
|
|
21
|
+
maintenance issue.
|
|
22
|
+
|
|
23
|
+
Status flags appear in the `notes` field. `EXPERIMENTAL` marks primitives in
|
|
24
|
+
the `content.*` layer whose detector implementations are likely brittle; an
|
|
25
|
+
attribution engine may choose to weight those at zero until field-validated.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
from enum import Enum
|
|
31
|
+
from typing import Any, Optional
|
|
32
|
+
|
|
33
|
+
from pydantic import BaseModel, Field
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class ValueKind(str, Enum):
    """Discriminator for the shape an `Observation.value` must take.

    Members are also ``str`` instances (the class mixes in ``str``), and each
    member's value is its lowercase name.
    """

    # A string, optionally restricted to a spec's `allowed` labels.
    CATEGORICAL = "categorical"
    # An int or float (bool is rejected), optionally bounded by min/max.
    NUMERIC = "numeric"
    # A non-empty string.
    HASH = "hash"
    # A list; elements are optionally checked against a spec's `array_of` kind.
    ARRAY = "array"
    # Any string, including the empty string.
    FREE_STRING = "free_string"
    # A bool.
    BOOL = "bool"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class ValueTypeSpec(BaseModel):
    """Per-primitive value-type spec (mirrors BEHAVE-SHELL's shape).

    ``kind`` selects which check ``validate_value`` applies; the remaining
    fields refine that check and are ignored by kinds they do not belong to.
    """

    kind: ValueKind
    # CATEGORICAL only: permitted labels. None disables the membership check.
    allowed: Optional[list[str]] = Field(default=None)
    # NUMERIC only: inclusive bounds. None leaves that side unbounded.
    min_val: Optional[float] = Field(default=None)
    max_val: Optional[float] = Field(default=None)
    # ARRAY only: kind every element must satisfy. None skips element checks.
    array_of: Optional[ValueKind] = Field(default=None)
    # Free-form documentation / status flags; never consulted by validation.
    notes: Optional[str] = Field(default=None)

    def validate_value(self, value: Any) -> None:
        """Check *value* against this spec.

        Args:
            value: the candidate ``Observation.value``.

        Returns:
            None when the value is acceptable.

        Raises:
            ValueError: when the value has the wrong type for ``kind``, is not
                one of ``allowed``, is NaN, falls outside ``min_val`` /
                ``max_val``, is an empty hash, or (for arrays) when any
                element fails the ``array_of`` check. Array errors are
                prefixed with the offending element's index.
        """
        import math  # function-scope: only the NUMERIC branch needs it

        if self.kind is ValueKind.CATEGORICAL:
            if not isinstance(value, str):
                raise ValueError(f"expected categorical string, got {type(value).__name__}")
            if self.allowed is not None and value not in self.allowed:
                raise ValueError(f"value {value!r} not in allowed set {self.allowed!r}")
        elif self.kind is ValueKind.NUMERIC:
            # bool is a subclass of int, so it has to be excluded explicitly.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                raise ValueError(f"expected numeric, got {type(value).__name__}")
            # NaN compares False against every bound, so without this guard it
            # would pass both range checks below and be accepted as in-range.
            if isinstance(value, float) and math.isnan(value):
                raise ValueError("expected numeric, got NaN")
            if self.min_val is not None and value < self.min_val:
                raise ValueError(f"value {value} below min_val {self.min_val}")
            if self.max_val is not None and value > self.max_val:
                raise ValueError(f"value {value} above max_val {self.max_val}")
        elif self.kind is ValueKind.HASH:
            if not isinstance(value, str) or not value:
                raise ValueError("expected non-empty hash string")
        elif self.kind is ValueKind.FREE_STRING:
            if not isinstance(value, str):
                raise ValueError(f"expected string, got {type(value).__name__}")
        elif self.kind is ValueKind.BOOL:
            if not isinstance(value, bool):
                raise ValueError(f"expected bool, got {type(value).__name__}")
        elif self.kind is ValueKind.ARRAY:
            if not isinstance(value, list):
                raise ValueError(f"expected array, got {type(value).__name__}")
            if self.array_of is None:
                return
            # Elements are checked by kind only: the per-element spec carries
            # no allowed set or bounds.
            element_spec = ValueTypeSpec(kind=self.array_of)
            for i, element in enumerate(value):
                try:
                    element_spec.validate_value(element)
                except ValueError as exc:
                    raise ValueError(f"array element [{i}]: {exc}") from None
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# ─── Convenience constructors ───────────────────────────────────────────────
|
|
93
|
+
|
|
94
|
+
def _cat(*allowed: str, notes: Optional[str] = None) -> ValueTypeSpec:
    """Build a CATEGORICAL spec whose permitted labels are *allowed*."""
    labels = list(allowed)
    return ValueTypeSpec(
        kind=ValueKind.CATEGORICAL,
        allowed=labels,
        notes=notes,
    )
|
|
96
|
+
|
|
97
|
+
def _num(
    min_val: Optional[float] = None,
    max_val: Optional[float] = None,
    notes: Optional[str] = None,
) -> ValueTypeSpec:
    """Build a NUMERIC spec with optional ``min_val`` / ``max_val`` bounds."""
    bounds = {"min_val": min_val, "max_val": max_val}
    return ValueTypeSpec(kind=ValueKind.NUMERIC, notes=notes, **bounds)
|
|
99
|
+
|
|
100
|
+
def _hash(notes: Optional[str] = None) -> ValueTypeSpec:
    """Build a HASH spec carrying the given *notes*."""
    spec = ValueTypeSpec(kind=ValueKind.HASH, notes=notes)
    return spec
|
|
102
|
+
|
|
103
|
+
def _str(notes: Optional[str] = None) -> ValueTypeSpec:
    """Build a FREE_STRING spec carrying the given *notes*."""
    spec = ValueTypeSpec(kind=ValueKind.FREE_STRING, notes=notes)
    return spec
|
|
105
|
+
|
|
106
|
+
def _array(of: ValueKind, notes: Optional[str] = None) -> ValueTypeSpec:
    """Build an ARRAY spec whose elements must each be of kind *of*."""
    return ValueTypeSpec(
        kind=ValueKind.ARRAY,
        array_of=of,
        notes=notes,
    )
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
# ─── The registry ───────────────────────────────────────────────────────────
|
|
111
|
+
#
|
|
112
|
+
# 35 primitives across 6 layers. Mirrors scratchpad.md row-for-row.
|
|
113
|
+
|
|
114
|
+
PRIMITIVE_REGISTRY: dict[str, ValueTypeSpec] = {
|
|
115
|
+
# ── stylometric.* (motor analog — 12) ─────────────────────────────────
|
|
116
|
+
"stylometric.punctuation_style": _hash(notes="canonical punctuation-pattern fingerprint"),
|
|
117
|
+
"stylometric.capitalization_habit": _cat(
|
|
118
|
+
"lowercase", "proper", "random_caps", "mixed_i",
|
|
119
|
+
notes="Dominant capitalization rule the author applies. lowercase=no capitals except "
|
|
120
|
+
"after sentence breaks. proper=standard title/sentence case. random_caps=no "
|
|
121
|
+
"consistent rule. mixed_i=author consistently writes 'i' in lowercase even "
|
|
122
|
+
"mid-sentence — common in Spanish chat where 'I' is not a standalone word "
|
|
123
|
+
"but the habit transfers from the native language's lowercase 'yo'.",
|
|
124
|
+
),
|
|
125
|
+
"stylometric.emoji_usage": _cat(
|
|
126
|
+
"none", "occasional", "frequent", "exclusive",
|
|
127
|
+
notes="Rate of emoji use per message. exclusive=messages rarely contain text without "
|
|
128
|
+
"emoji. This captures tone and register — heavy emoji use in a criminal-market "
|
|
129
|
+
"context is a distinct style trait worth preserving.",
|
|
130
|
+
),
|
|
131
|
+
"stylometric.emoji_placement": _cat(
|
|
132
|
+
"pre_punctuation", "post_punctuation", "no_punctuation", "mixed",
|
|
133
|
+
notes="Where emojis appear relative to sentence-ending punctuation. "
|
|
134
|
+
"pre_punctuation='Hola 😊.' post_punctuation='Hola. 😊' "
|
|
135
|
+
"Individual authors are strikingly consistent in this micro-habit.",
|
|
136
|
+
),
|
|
137
|
+
"stylometric.message_length_class": _cat(
|
|
138
|
+
"short", "medium", "long", "paragraph",
|
|
139
|
+
notes="Median message length bucket: short=1-5 words, medium=6-20 words, "
|
|
140
|
+
"long=21-50 words, paragraph=>50 words. See also "
|
|
141
|
+
"stylometric.message_length_variance_class for the distribution shape.",
|
|
142
|
+
),
|
|
143
|
+
"stylometric.message_length_variance_class": _cat(
|
|
144
|
+
"tight", "varied", "bimodal",
|
|
145
|
+
notes="Coefficient of variation of per-message word counts. Captures "
|
|
146
|
+
"DISTRIBUTION SHAPE that message_length_class collapses by "
|
|
147
|
+
"emitting only the median bucket. Two authors can share the same "
|
|
148
|
+
"median length but have wildly different variance: `tight` (CV<0.5) "
|
|
149
|
+
"= consistent (always 1-3 words), `varied` (0.5<=CV<1.5) = normal "
|
|
150
|
+
"mix, `bimodal` (CV>=1.5) = long-tail (mostly short with occasional "
|
|
151
|
+
"rants). Added in v0.2 after Rutify calibration found median-only "
|
|
152
|
+
"bucketing discarded most of the per-author variance signal.",
|
|
153
|
+
),
|
|
154
|
+
"stylometric.linebreak_style": _cat(
|
|
155
|
+
"single_thought", "multi_line", "wall_of_text",
|
|
156
|
+
notes="Whether the author sends one complete thought per message or breaks a single "
|
|
157
|
+
"statement into multiple sequential short messages. multi_line=habitual "
|
|
158
|
+
"message-burst style (sends 3-5 short messages in rapid succession instead "
|
|
159
|
+
"of one composed message). wall_of_text=rarely uses line breaks, sends dense "
|
|
160
|
+
"blocks. Captures a stylistic rhythm that is hard to consciously alter.",
|
|
161
|
+
),
|
|
162
|
+
"stylometric.typo_signature": _hash(notes="sha256 of canonical persistent-typo set"),
|
|
163
|
+
"stylometric.function_word_distribution_top50": _hash(
|
|
164
|
+
notes="64-bit simhash over the 50-most-common Spanish function-word frequency "
|
|
165
|
+
"vector. Mosteller-Wallace gold standard for English long-form authorship; "
|
|
166
|
+
"EMPIRICALLY DOMAIN-FLAWED for Spanish chat-domain — calibrated 2026-05-02 "
|
|
167
|
+
"against the Rutify corpus showed within-author and cross-author Hamming "
|
|
168
|
+
"distance distributions overlap (within median 8 bits, cross median 10 "
|
|
169
|
+
"bits) so this primitive ALONE cannot discriminate authors in chat-style "
|
|
170
|
+
"short-message corpora. Engines should weight it low until paired with "
|
|
171
|
+
"the larger top-200 variant or composited with character n-gram and "
|
|
172
|
+
"distinctive-vocabulary signatures (see siblings below). Kept in v0 for "
|
|
173
|
+
"calibration grids and documentary purposes.",
|
|
174
|
+
),
|
|
175
|
+
"stylometric.function_word_distribution_top200": _hash(
|
|
176
|
+
notes="64-bit simhash over the 200-most-common Spanish function-word frequency "
|
|
177
|
+
"vector. The wider list reaches into the long tail (rare-but-individual "
|
|
178
|
+
"function words like `tampoco`, `aunque`, `mientras`) that carry more "
|
|
179
|
+
"discriminating signal in short-message chat domains. NOT YET EMITTED by "
|
|
180
|
+
"the v0 prototype extractor; populated when v0.2 calibration is done.",
|
|
181
|
+
),
|
|
182
|
+
"stylometric.character_ngram_simhash": _hash(
|
|
183
|
+
notes="64-bit simhash over a frequency vector of character n-grams (default "
|
|
184
|
+
"n=3) from the author's lowercased text corpus. ORTHOGONAL to "
|
|
185
|
+
"function-word distributions: captures punctuation tics, accent-"
|
|
186
|
+
"stripping habits, typo patterns, and idiom-fragment fingerprints "
|
|
187
|
+
"that survive paraphrase. Lowercases input so that capitalization "
|
|
188
|
+
"habits — already captured by stylometric.capitalization_habit — "
|
|
189
|
+
"do not double-count. Accents PRESERVED because accent-stripping is "
|
|
190
|
+
"itself a stylistic tic worth catching. Source label declares n size "
|
|
191
|
+
"(e.g. `#char3gram`, `#char4gram`).",
|
|
192
|
+
),
|
|
193
|
+
"stylometric.distinctive_vocabulary_signature": _hash(
|
|
194
|
+
notes="64-bit simhash over a TF-IDF-weighted top-K rare-word vector. "
|
|
195
|
+
"COMPLEMENTARY to function-word distributions: where function_word_* "
|
|
196
|
+
"captures common-word *style*, this captures the author's distinctive "
|
|
197
|
+
"*lexicon* (the words this person uses that other authors in the same "
|
|
198
|
+
"corpus do NOT). Strong against context-shift because rare words are "
|
|
199
|
+
"where authorial choice lives. Requires the chat corpus for IDF "
|
|
200
|
+
"computation, performed once per extraction. Source label declares the "
|
|
201
|
+
"top-K size and corpus tag (e.g. `#tfidf-top50`).",
|
|
202
|
+
),
|
|
203
|
+
|
|
204
|
+
# ── lexical.* (cognitive analog — 8) ──────────────────────────────────
|
|
205
|
+
"lexical.vocabulary_richness": _num(
|
|
206
|
+
min_val=0.0, max_val=1.0,
|
|
207
|
+
notes="Moving-Average Type-Token Ratio (MATTR) over a sliding window "
|
|
208
|
+
"(default 50 tokens). Volume-independent: each window contributes "
|
|
209
|
+
"its own unique/total ratio, the primitive's value is the mean. "
|
|
210
|
+
"Avoids the standard TTR bias where larger corpora mechanically "
|
|
211
|
+
"score lower. Source label declares the window size.",
|
|
212
|
+
),
|
|
213
|
+
"lexical.slang_density": _num(min_val=0.0, max_val=1.0,
|
|
214
|
+
notes="rate per message; locale-tuned slang corpus"),
|
|
215
|
+
"lexical.code_switching_rate": _num(min_val=0.0, max_val=1.0,
|
|
216
|
+
notes="switches per N tokens; Solorio & Liu metric"),
|
|
217
|
+
"lexical.code_switching_matrix_language": _str(notes="BCP-47 of dominant language"),
|
|
218
|
+
"lexical.code_switching_embedded_languages": _array(ValueKind.FREE_STRING,
|
|
219
|
+
notes="BCP-47 list of non-matrix languages observed"),
|
|
220
|
+
"lexical.sentence_complexity_class": _cat(
|
|
221
|
+
"simple", "compound", "complex",
|
|
222
|
+
notes="Dominant clause structure. simple=single-clause messages (no conjunctions "
|
|
223
|
+
"or subordination). compound=two independent clauses joined by coordinating "
|
|
224
|
+
"conjunctions (pero, y, o, ni). complex=dependent clauses and subordination "
|
|
225
|
+
"(aunque, porque, cuando, que + verb). Reflects education level and "
|
|
226
|
+
"cognitive investment in message composition.",
|
|
227
|
+
),
|
|
228
|
+
"lexical.question_formation_style": _cat(
|
|
229
|
+
"punctuation_only", "lexical", "formal",
|
|
230
|
+
notes="How questions are formed. punctuation_only=question mark appended without "
|
|
231
|
+
"interrogative words ('¿Cuánto?' or 'Mañana?') — very common in Spanish "
|
|
232
|
+
"chat. lexical=explicit interrogatives (¿qué, cómo, cuándo, dónde). "
|
|
233
|
+
"formal=inverted subject-verb order or formal register ('¿Podría usted...'). "
|
|
234
|
+
"Captures register and education level.",
|
|
235
|
+
),
|
|
236
|
+
"lexical.imperative_style": _cat(
|
|
237
|
+
"informal_directive", "formal_directive", "polite",
|
|
238
|
+
notes="How commands and requests are framed. informal_directive=tú/vos imperative "
|
|
239
|
+
"('dame', 'hazlo', 'mándame'). formal_directive=usted imperative "
|
|
240
|
+
"('hágame el favor', 'envíeme'). polite=conditional or modal softening "
|
|
241
|
+
"('¿podría...?', 'me gustaría...'). Stable per-author trait in criminal "
|
|
242
|
+
"market contexts where hierarchical and peer relationships are expressed "
|
|
243
|
+
"through register choice.",
|
|
244
|
+
),
|
|
245
|
+
|
|
246
|
+
# ── temporal_evolution.* (lifecycle / change-over-time — 1) ───────────
|
|
247
|
+
"temporal_evolution.lifecycle_phase": _cat(
|
|
248
|
+
"arrival_burst", "stable_member", "fluctuating_member",
|
|
249
|
+
"inflection_member", "declining_member", "unknown",
|
|
250
|
+
notes="Auto-classified lifecycle stage derived from windowed within-"
|
|
251
|
+
"corpus analysis. arrival_burst: tenure < 24hr with first-window "
|
|
252
|
+
"volume dominating later windows and high inter-window drift "
|
|
253
|
+
"(empirically validated 2026-05-03 against OxPayload's first 12 "
|
|
254
|
+
"hours on Rutify). stable_member: low drift between consecutive "
|
|
255
|
+
"windows across the whole tenure. fluctuating_member (added v0.3): "
|
|
256
|
+
"tenure ≥ 24hr with median drift in [stable_max, inflection_min) "
|
|
257
|
+
"and no single window crossing inflection_min — established noisy "
|
|
258
|
+
"regulars who don't fit clean stable/inflection classes (e.g. "
|
|
259
|
+
"labelled admin lamarabitch, formerly classified unknown). "
|
|
260
|
+
"inflection_member: long-tenure actor whose drift spikes in at "
|
|
261
|
+
"least one window-pair (a real behavioral shift mid-corpus). "
|
|
262
|
+
"declining_member: monotonically decreasing per-window message "
|
|
263
|
+
"counts. unknown: insufficient windowed data for classification. "
|
|
264
|
+
"Window size adapts to tenure: <24hr → 2h windows, <7d → 12h, "
|
|
265
|
+
"<30d → 1d, otherwise 7d.",
|
|
266
|
+
),
|
|
267
|
+
|
|
268
|
+
# ── network.* (governance/role-shape signals — 2, added v0.3) ─────────
|
|
269
|
+
"network.is_likely_bot": _cat(
|
|
270
|
+
"likely_bot", "not_bot", "unknown",
|
|
271
|
+
notes="Heuristic bot detector composited from existing primitives. "
|
|
272
|
+
"Classifies as likely_bot when conversation_initiation_rate ≥ 0.95 "
|
|
273
|
+
"AND attention_pattern = broadcast AND vocabulary_richness < 0.65. "
|
|
274
|
+
"Empirically validated 2026-05-03 against the tdl-labeled Rutify "
|
|
275
|
+
"bot SangMata_beta_bot (correctly caught) vs 11 high-volume humans "
|
|
276
|
+
"in the same corpus (none false-positive). NOT a verdict — engines "
|
|
277
|
+
"should treat as a candidate signal, especially since low-volume "
|
|
278
|
+
"bots (e.g. QuotLyBot with 9 messages) sit below the fingerprint "
|
|
279
|
+
"threshold and emit nothing here. Source label declares the "
|
|
280
|
+
"heuristic version (e.g. #bot-heuristic-v1).",
|
|
281
|
+
),
|
|
282
|
+
"network.governance_role_signal": _cat(
|
|
283
|
+
"admin_pattern", "responder_pattern", "regular", "bot_pattern", "unknown",
|
|
284
|
+
notes="Heuristic role-shape composited from interaction primitives + "
|
|
285
|
+
"lifecycle_phase. admin_pattern: init_rate ≥ 0.80 AND attn = "
|
|
286
|
+
"reciprocal AND non-bot AND not arrival_burst. responder_pattern: "
|
|
287
|
+
"init_rate ≤ 0.45 AND attn = reciprocal. bot_pattern: matches "
|
|
288
|
+
"network.is_likely_bot likely_bot. regular: everything else above "
|
|
289
|
+
"the volume threshold. Empirically caught all 4 high-volume "
|
|
290
|
+
"tdl-labeled Rutify admins, sebaImlI as responder, "
|
|
291
|
+
"SangMata_beta_bot as bot, OxPayload/bopxcx as regular (their "
|
|
292
|
+
"arrival_burst lifecycle overrides the admin-shaped init_rate). "
|
|
293
|
+
"NOT a ground-truth admin label — kkaxlazer matches admin_pattern "
|
|
294
|
+
"while not formally admin, but the 2026-05-03 reply-graph cohort "
|
|
295
|
+
"analysis showed they're operationally embedded in the admin "
|
|
296
|
+
"layer (4/4 cohort signal with the top admin), so the heuristic "
|
|
297
|
+
"is doing the right thing.",
|
|
298
|
+
),
|
|
299
|
+
|
|
300
|
+
# ── interaction.* (temporal analog — 6) ───────────────────────────────
|
|
301
|
+
"interaction.response_latency_class": _cat(
|
|
302
|
+
"immediate", "fast", "normal", "slow", "sporadic",
|
|
303
|
+
notes="How quickly the actor responds to messages directed at them. "
|
|
304
|
+
"immediate=<30s (suggests active monitoring or automated response). "
|
|
305
|
+
"fast=30s-5min. normal=5-60min (typical async chat). slow=1-24hr. "
|
|
306
|
+
"sporadic=no consistent response latency — appears and disappears.",
|
|
307
|
+
),
|
|
308
|
+
"interaction.conversation_initiation_rate": _num(min_val=0.0, max_val=1.0,
|
|
309
|
+
notes="thread-starting messages / total"),
|
|
310
|
+
"interaction.message_burst_rate": _cat(
|
|
311
|
+
"single", "occasional", "habitual",
|
|
312
|
+
notes="Whether the actor sends multiple messages in rapid sequence within a "
|
|
313
|
+
"conversation turn. habitual=almost always bursts (sends 3+ messages "
|
|
314
|
+
"before any reply). single=almost always one message per turn. Tied to "
|
|
315
|
+
"stylometric.linebreak_style multi_line.",
|
|
316
|
+
),
|
|
317
|
+
"interaction.active_hours_class": _str(notes="UTC active-hours window summary"),
|
|
318
|
+
"interaction.session_duration_class": _cat("short", "medium", "long", "marathon",
|
|
319
|
+
notes="REUSED enum from BEHAVE-SHELL temporal.session_duration"),
|
|
320
|
+
"interaction.attention_pattern": _cat("broadcast", "focused", "reciprocal",
|
|
321
|
+
notes="from reply-graph centrality"),
|
|
322
|
+
|
|
323
|
+
# ── content.* (operational analog — 6, EXPERIMENTAL) ──────────────────
|
|
324
|
+
"content.role_signal": _cat("admin", "seller", "buyer", "lurker", "newbie",
|
|
325
|
+
notes="EXPERIMENTAL — locale-tuned role-vocabulary classifier; "
|
|
326
|
+
"may be moved to a separate IOC/keyword-detection layer "
|
|
327
|
+
"once tested against the Rutify corpus"),
|
|
328
|
+
"content.transactional_language": _num(min_val=0.0, max_val=1.0,
|
|
329
|
+
notes="EXPERIMENTAL — rate of transactional terms; "
|
|
330
|
+
"locale-specific, brittle to vocabulary drift"),
|
|
331
|
+
"content.opsec_awareness": _num(min_val=0.0, max_val=1.0,
|
|
332
|
+
notes="EXPERIMENTAL — rate of security-conscious phrases; "
|
|
333
|
+
"HIGH FALSE-POSITIVE RISK on casual conversation about "
|
|
334
|
+
"deleting files / messages"),
|
|
335
|
+
"content.targeting_language": _array(ValueKind.FREE_STRING,
|
|
336
|
+
notes="EXPERIMENTAL — IOC-shaped target patterns "
|
|
337
|
+
"(bank names, government portals, RUT ranges, etc); "
|
|
338
|
+
"consider moving to dedicated IOC layer"),
|
|
339
|
+
"content.boasting_pattern": _cat("none", "occasional", "frequent",
|
|
340
|
+
notes="EXPERIMENTAL — success-claim regex; corpus-dependent"),
|
|
341
|
+
"content.conflict_style": _cat("aggressive", "defusing", "appellate",
|
|
342
|
+
notes="EXPERIMENTAL — dispute-tone classifier; needs "
|
|
343
|
+
"labelled training data"),
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def is_known(primitive: str) -> bool:
    """Return True when *primitive* is a key of ``PRIMITIVE_REGISTRY``."""
    registered = primitive in PRIMITIVE_REGISTRY
    return registered
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def get(primitive: str) -> ValueTypeSpec:
    """Look up the value-type spec registered for *primitive*.

    Raises:
        KeyError: if *primitive* is not in ``PRIMITIVE_REGISTRY``.
    """
    spec = PRIMITIVE_REGISTRY[primitive]
    return spec
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: behave-text
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: BEHAVE-TEXT — text/messaging-domain behavioral observation registry, layered on behave-core
|
|
5
|
+
Author: ANTI
|
|
6
|
+
License: GPL-3.0-or-later
|
|
7
|
+
Project-URL: Source, https://git.resacachile.cl/anti/BEHAVE
|
|
8
|
+
Requires-Python: >=3.11
|
|
9
|
+
Requires-Dist: pydantic>=2.6
|
|
10
|
+
Requires-Dist: behave-core>=0.1.0
|
|
11
|
+
Provides-Extra: dev
|
|
12
|
+
Requires-Dist: pytest>=8; extra == "dev"
|
|
13
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
14
|
+
Requires-Dist: ruff; extra == "dev"
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
behave_text/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
behave_text/spec/__init__.py,sha256=J-Lx1_8cKi0aNKyv8zdzxG3PTWdJTvn96ZbzjMgKM8s,1474
|
|
3
|
+
behave_text/spec/envelope.py,sha256=imKzM2jPivPaEZia_Uq0ZLcPU3kis4ds-Sqkb2l1lZ4,1745
|
|
4
|
+
behave_text/spec/primitives.py,sha256=8ZX7S2618B-Euclq0uKx7KHa8lk1GK2IZ9v4OUret1g,22155
|
|
5
|
+
behave_text-0.1.0.dist-info/METADATA,sha256=zr9whWTRof2utyQz2XP8cQVLDiC2Vz3XOrYWiOcABa4,483
|
|
6
|
+
behave_text-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
7
|
+
behave_text-0.1.0.dist-info/top_level.txt,sha256=2XZom1-c2zsAW27NyrPj2pddXVZ3Go1q4HGiNhzvEq4,12
|
|
8
|
+
behave_text-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
behave_text
|