narsche 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
narsche-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) [year] [fullname]
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
narsche-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,84 @@
1
+ Metadata-Version: 2.4
2
+ Name: narsche
3
+ Version: 0.1.0
4
+ Summary: Computing schematicity of autobiographical narratives
5
+ Author-email: Isaac Kinley <isaac.kinley@gmail.com>
6
+ License-Expression: MIT
7
+ Requires-Python: >=3.9
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: networkx
11
+ Requires-Dist: spacy
12
+ Requires-Dist: wordfreq
13
+ Dynamic: license-file
14
+
15
+
16
+ # Measuring narrative schematicity
17
+
18
+ [![codecov](https://codecov.io/github/kinleyid/narsche/graph/badge.svg?token=EHCYVTZWCE)](https://codecov.io/github/kinleyid/narsche)
19
+
20
+ Methods from the paper "Computational Tools for Quantifying Schemas in Autobiographical Narratives".
21
+
22
+ ## Installation
23
+
24
+ [Under construction until this project is on PyPI]
25
+
26
+ Depends on `networkx` (for network models), `SpaCy` (for tokenization), and `wordfreq` (for automated topic identification). Additionally, one of `SpaCy`'s models must be downloaded (see their instructions on how to do this).
27
+
28
+ ## Usage
29
+
30
+ ### Loading and saving models
31
+
32
+ A text file of word vectors can be read using the `read_vectors()` function:
33
+
34
+ ```python
35
+ vec_mod = narsche.read_vectors('/path/to/vectors.txt')
36
+ ```
37
+
38
+ This produces a vector model. Initializing a network model requires first loading a `networkx.Graph` object:
39
+
40
+ ```python
41
+ import networkx as nx
42
+
43
+ graph = nx.read_graphml('/path/to/graph.graphml') # or any other networkx reader; every edge needs a 'strength' attribute
44
+ net_mod = narsche.NetworkModel(graph)
45
+ ```
46
+
47
+ Models can be saved using the `save()` method and loaded using the `load()` class method:
48
+
49
+ ```python
50
+ net_mod.save('network.mod')
51
+ net_mod = narsche.NetworkModel.load('network.mod')
52
+
53
+ vec_mod.save('vector.mod')
54
+ vec_mod = narsche.VectorModel.load('vector.mod')
55
+ ```
56
+
57
+ These are just wrappers around `pickle.[load/dump]`. Any extension can be used.
58
+
59
+ ### Tokenizing narratives
60
+
61
+ Before schematicity can be computed, narratives must be tokenized, i.e., converted to a list of tokens. For this, there is a `Tokenizer()` class that relies on `SpaCy`:
62
+
63
+ ```python
64
+ txt = 'I sat on the sofa in my living room with a lamp' # Example text
65
+ tokenizer = narsche.Tokenizer('en_core_web_sm') # Initialize tokenizer
66
+ words = tokenizer.tokenize(txt) # Tokenize words
67
+ words = vec_mod.keep_known(words) # Use only those words that are in the model
68
+ ```
69
+
70
+ ### Computing schematicity
71
+
72
+ Given a model and a set of tokens (and possibly a topic word), schematicity can be computed using the `schematicity()` function:
73
+
74
+ ```python
75
+ topic = narsche.identify_topic(words) # Identify the topic
76
+ # Compute schematicity
77
+ narsche.schematicity(
78
+ words=words,
79
+ model=vec_mod,
80
+ method='on-topic-ppn', # or topic-relatedness, pairwise-relatedness, or component-size
81
+ topic=topic)
82
+ ```
83
+
84
+ See the documentation of the `schematicity()` function for keywords required by other methods.
@@ -0,0 +1,70 @@
1
+
2
+ # Measuring narrative schematicity
3
+
4
+ [![codecov](https://codecov.io/github/kinleyid/narsche/graph/badge.svg?token=EHCYVTZWCE)](https://codecov.io/github/kinleyid/narsche)
5
+
6
+ Methods from the paper "Computational Tools for Quantifying Schemas in Autobiographical Narratives".
7
+
8
+ ## Installation
9
+
10
+ [Under construction until this project is on PyPI]
11
+
12
+ Depends on `networkx` (for network models), `SpaCy` (for tokenization), and `wordfreq` (for automated topic identification). Additionally, one of `SpaCy`'s models must be downloaded (see their instructions on how to do this).
13
+
14
+ ## Usage
15
+
16
+ ### Loading and saving models
17
+
18
+ A text file of word vectors can be read using the `read_vectors()` function:
19
+
20
+ ```python
21
+ vec_mod = narsche.read_vectors('/path/to/vectors.txt')
22
+ ```
23
+
24
+ This produces a vector model. Initializing a network model requires first loading a `networkx.Graph` object:
25
+
26
+ ```python
27
+ import networkx as nx
28
+
29
+ graph = nx.read_graphml('/path/to/graph.graphml') # or any other networkx reader; every edge needs a 'strength' attribute
30
+ net_mod = narsche.NetworkModel(graph)
31
+ ```
32
+
33
+ Models can be saved using the `save()` method and loaded using the `load()` class method:
34
+
35
+ ```python
36
+ net_mod.save('network.mod')
37
+ net_mod = narsche.NetworkModel.load('network.mod')
38
+
39
+ vec_mod.save('vector.mod')
40
+ vec_mod = narsche.VectorModel.load('vector.mod')
41
+ ```
42
+
43
+ These are just wrappers around `pickle.[load/dump]`. Any extension can be used.
44
+
45
+ ### Tokenizing narratives
46
+
47
+ Before schematicity can be computed, narratives must be tokenized, i.e., converted to a list of tokens. For this, there is a `Tokenizer()` class that relies on `SpaCy`:
48
+
49
+ ```python
50
+ txt = 'I sat on the sofa in my living room with a lamp' # Example text
51
+ tokenizer = narsche.Tokenizer('en_core_web_sm') # Initialize tokenizer
52
+ words = tokenizer.tokenize(txt) # Tokenize words
53
+ words = vec_mod.keep_known(words) # Use only those words that are in the model
54
+ ```
55
+
56
+ ### Computing schematicity
57
+
58
+ Given a model and a set of tokens (and possibly a topic word), schematicity can be computed using the `schematicity()` function:
59
+
60
+ ```python
61
+ topic = narsche.identify_topic(words) # Identify the topic
62
+ # Compute schematicity
63
+ narsche.schematicity(
64
+ words=words,
65
+ model=vec_mod,
66
+ method='on-topic-ppn', # or topic-relatedness, pairwise-relatedness, or component-size
67
+ topic=topic)
68
+ ```
69
+
70
+ See the documentation of the `schematicity()` function for keywords required by other methods.
@@ -0,0 +1,465 @@
1
+ import pickle
2
+ import numpy as np
3
+ from wordfreq import word_frequency
4
+ from collections import Counter
5
+ import networkx as nx
6
+ import spacy
7
+ import sys
8
+ from pdb import set_trace
9
+
10
+ epsilon = sys.float_info.epsilon
11
+
12
+
13
def get_pairs(iterable):
    """
    Gets every unordered pair of items from some iterable collection.

    Each item is paired with every item that comes after it, so the result
    holds all 2-item combinations (not only neighbouring items), in order of
    first appearance.

    Args:
        iterable (iterable): some iterable collection (e.g., a list or tuple)

    Returns:
        pairs (list of tuple): all 2-item combinations of the input
    """
    # Imported locally so that the function is self-contained
    from itertools import combinations

    return list(combinations(iterable, 2))
25
+
26
+
27
def identify_topic(words):
    """
    Identifies the topic using tf-idf

    The term frequency of a word is its count in `words`. The inverse document
    frequency is log(1 / f), where f is the frequency returned by
    `wordfreq.word_frequency` for English. The topic is the word with the
    highest product of the two; ties go to the alphabetically first word.

    Args:
        words (list): a list of words

    Returns:
        topic (str): the topic word

    Raises:
        ValueError: if words is empty
    """
    tf = Counter(words)
    if not tf:
        raise ValueError("words is empty")
    # `minimum=epsilon` puts a floor under the reported frequency, so it is
    # never 0 and the logarithm below is always finite
    tf_idf = {
        word: count
        * np.log(1 / word_frequency(word=word, lang="en", minimum=epsilon))
        for word, count in tf.items()
    }
    # max() keeps the first maximum it meets, so walking the words in sorted
    # order makes the tie-break deterministic
    return str(max(sorted(tf_idf), key=tf_idf.get))
59
+
60
+
61
def read_vectors(file, encoding="utf-8"):
    """
    Reads word vectors from a text file. Each line of the file should be formatted <word> <dim1> <dim2> <dim3> ... Vectors are automatically normalized

    Blank lines are skipped and repeated or trailing spaces between vector
    elements are tolerated. All-zero vectors are left as zeros rather than
    being turned into NaN by the normalization.

    Args:
        file (str): name of text file
        encoding (str): encoding used when reading the file

    Returns:
        model (VectorModel): vectors in a VectorModel

    Raises:
        ValueError: if the file contains no vectors
    """
    words = []
    rows = []
    with open(file, "r", encoding=encoding) as f:
        for line in f:
            line = line.rstrip("\n")
            if not line.strip():
                continue
            # First item in space-delimited line is token, remaining items are vector elements
            token, *elements = line.split(" ")
            words.append(token)
            # Empty strings come from repeated/trailing spaces and cannot be parsed as floats
            elements = [element for element in elements if element]
            rows.append(np.asarray(elements).astype(np.float32))
    if not rows:
        raise ValueError("no vectors found in %s" % file)
    vectors = np.array(rows)
    # Normalize to unit length so that a dot product gives the cosine similarity
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1  # avoid 0 / 0 for all-zero vectors
    vectors /= norms
    return VectorModel(words, vectors)
88
+
89
+
90
class Model:
    """
    Base class giving models pickle-based persistence and vocabulary filtering.

    `keep_known()` relies on `word in self`, so subclasses are expected to
    implement `__contains__`.
    """

    def save(self, path):
        """
        Saves the model to a file using pickle

        Args:
            path (str): file to write
        """
        with open(path, "wb") as f:
            pickle.dump(self, f)

    @classmethod
    def load(cls, path):
        """
        Loads a model previously written by `save()`

        Args:
            path (str): file to read

        Returns:
            obj: the unpickled model, an instance of the class `load()` was called on

        Raises:
            TypeError: if the file holds an object of a different type
        """
        # SECURITY: unpickling can execute arbitrary code, so only load files
        # that come from a trusted source
        with open(path, "rb") as f:
            obj = pickle.load(f)
        if not isinstance(obj, cls):
            raise TypeError(
                f"Expected instance of {cls.__name__}, got {type(obj).__name__}"
            )
        return obj

    def keep_known(self, words):
        """
        Filters out the words that are not in the model

        Args:
            words (list of str): words to filter

        Returns:
            known (list of str): the words that are in the model, in their original order
        """
        return [word for word in words if word in self]
107
+
108
+
109
class VectorModel(Model):
    """
    Vector-based model

    Attributes:
        words (list of str): words for which the model has vectors
        vectors (matrix): numpy matrix containing the vectors (in the same order as the word list)

    The word list should be treated as read-only once the model is in use:
    a word -> row lookup table is cached the first time it is needed.
    """

    def __init__(self, words, vectors):
        """
        Initializes vector model

        Args:
            words (list of str): words in model
            vectors (numpy array): array of corresponding vectors

        Raises:
            ValueError: if words is not a list of strings, vectors is not a
                numpy array, or their lengths differ
        """
        if not isinstance(words, list):
            # Other sequences are accepted, but stored as a list
            words = list(words)
        if not all(isinstance(word, str) for word in words):
            raise ValueError("words is not a list of strings")
        if not isinstance(vectors, np.ndarray):
            raise ValueError("vectors is not an np.ndarray")
        if len(words) != len(vectors):
            raise ValueError("different numbers of words and vectors")
        # Store efficiently---list of words, matrix of vectors, and index
        self.words = words
        self.vectors = vectors

    def _lookup(self):
        """
        Returns the word -> row index table, building it on first use.

        Built lazily so that models pickled without the table still work
        after being loaded.
        """
        index = self.__dict__.get("_index")
        if index is None:
            index = {}
            for i, word in enumerate(self.words):
                # First occurrence wins, which is what list.index() would return
                index.setdefault(word, i)
            self._index = index
        return index

    def __contains__(self, word):
        try:
            return word in self._lookup()
        except TypeError:
            # Unhashable objects cannot be words in the model
            return False

    def in_model(self, word):
        """Returns True if the model has a vector for word."""
        return word in self

    def compute_sim(self, word1, word2):
        """
        Compute cosine similarity between words

        Args:
            word1 (str): first word
            word2 (str): second word

        Returns:
            sim (float): cosine similarity (nan if either word is not in the model)
        """
        if word1 not in self or word2 not in self:
            return float("nan")
        index = self._lookup()
        # The dot product is used as the cosine similarity, which holds for
        # unit-length vectors (read_vectors() normalizes them)
        return np.dot(self.vectors[index[word1]], self.vectors[index[word2]])

    def get_lexicon(self, topic, top_n=10000, including_topic=True):
        """
        Get "lexicon" of words most related to a topic

        Args:
            topic (str): topic word
            top_n (int): size of lexicon
            including_topic (bool): should the topic word be included in the lexicon?

        Returns:
            lexicon (list of str): list of words most related to the topic,
                ordered from least to most related

        Raises:
            ValueError: if topic is not in the model
        """
        if topic not in self:
            raise ValueError('topic "%s" is not in model' % topic)
        # First compute similarities (faster than constructing new matrix not including topic)
        topic_vector = self.vectors[self._lookup()[topic]]
        similarities = np.matmul(self.vectors, topic_vector)
        # Sort by similarity (ascending, so the most related words come last)
        sorted_words = [self.words[i] for i in np.argsort(similarities)]
        # Remove topic word itself?
        if not including_topic:
            sorted_words.remove(topic)
        # Pare down. A slice starting at -0 would return the whole list, so
        # non-positive sizes are handled explicitly
        if top_n <= 0:
            return []
        return sorted_words[-top_n:]

    def as_graph(self, threshold, words=None):
        """
        Convert vector model to network model

        Only words that share at least one edge end up in the network.

        Args:
            threshold (float): only pairs of words whose cosine similarity is greater than or equal to this threshold will share an edge in the resulting network
            words (list of str): for speed, only this subset of words will be used to produce the network (rather than all words in the vector-based model)

        Returns:
            model (NetworkModel): graph-based model
        """
        # Get only those tokens that are actually in current dictionary
        if words is not None:
            words = [w for w in words if w in self]
        else:
            words = self.words
        graph = nx.Graph()
        for word1, word2 in get_pairs(words):
            sim = self.compute_sim(word1, word2)
            if sim >= threshold:
                graph.add_edge(word1, word2, strength=sim)
        # Create network model
        return NetworkModel(graph)
216
+
217
+
218
class NetworkModel(Model):
    """
    Network-based model

    Attributes:
        graph (networkx.Graph): graph of words
    """

    def __init__(self, graph):
        """
        Initializes network model

        The graph is stored as given (not copied) and every edge receives an
        additional "inv_strength" attribute equal to 1 / strength.

        Args:
            graph (networkx.Graph): network of words whose edges include a "strength" attribute

        Raises:
            TypeError: if graph is not a networkx.Graph
            ValueError: if an edge has no "strength" attribute, or a strength
                that is not strictly positive
        """
        if not isinstance(graph, nx.Graph):
            raise TypeError(f"Expected a networkx.Graph, got {type(graph).__name__}")
        # Compute inverse strength
        inv_strength = {}
        for u, v, data in graph.edges(data=True):
            if "strength" not in data:
                raise ValueError(f"Edge ({u}, {v}) is missing 'strength' attribute")
            strength = data["strength"]
            # Inverse strengths are used as path lengths, which must be
            # positive and finite (`not >` also rejects nan)
            if not strength > 0:
                raise ValueError(
                    f"Edge ({u}, {v}) has non-positive strength {strength}"
                )
            inv_strength[(u, v)] = 1 / strength
        nx.set_edge_attributes(graph, inv_strength, "inv_strength")
        self.graph = graph

    def __contains__(self, word):
        return word in self.graph

    def in_model(self, word):
        """Returns True if word is a node of the graph."""
        return word in self.graph

    def compute_sim(self, word1, word2):
        """
        Compute efficiency-based similarity (i.e., the inverse of the length of the shortest path between words)

        Args:
            word1 (str): first word
            word2 (str): second word

        Returns:
            efficiency (float): efficiency-based similarity measure (0 if no
                path connects the words, nan if either word is not in the model)
        """
        if word1 not in self.graph or word2 not in self.graph:
            return float("nan")
        try:
            distance, _ = nx.bidirectional_dijkstra(
                self.graph, word1, word2, weight="inv_strength"
            )
        except nx.NetworkXNoPath:
            # No path between nodes
            return 0
        if distance == 0:
            # NOTE(review): a word compared with itself has distance 0. The
            # previous implementation returned 0 here (the division error was
            # swallowed), so that result is kept---confirm it is intended.
            return 0
        return 1 / distance

    def get_lexicon(self, topic, max_steps=2, including_topic=True):
        """
        Get "lexicon" of words most related to a topic. This function is a wrapper around networkx.ego_graph()

        Args:
            topic (str): topic word
            max_steps (int): number of steps to traverse to identify related words
            including_topic (bool): should the topic word be included in the lexicon?

        Returns:
            lexicon (list of str): list of words most related to the topic
        """
        # distance=None makes ego_graph count steps rather than use an edge attribute
        ego_graph = nx.ego_graph(
            self.graph, n=topic, radius=max_steps, center=including_topic, distance=None
        )
        return list(ego_graph)

    def largest_component(self, words):
        """
        Get largest network component in the subgraph induced by words

        Args:
            words (list of str): words by which to induce subgraph

        Returns:
            component (set of str): words in the largest connected component
                (an empty set if none of the words are in the graph)
        """
        subgraph = self.graph.subgraph(words)
        return max(nx.connected_components(subgraph), key=len, default=set())
317
+
318
+
319
class Tokenizer:
    """
    SpaCy-based tokenizer

    Attributes:
        nlp (spacy model): SpaCy model used to tokenize
    """

    # Part-of-speech tags treated as content words
    _CONTENT_POS = ("NOUN", "VERB", "ADJ", "ADV")

    def __init__(self, spacy_model="en_core_web_sm"):
        """
        Loads the SpaCy model

        Args:
            spacy_model (str): name of the SpaCy model to load
        """
        self.nlp = spacy.load(spacy_model)

    def _lemmatize_token(self, token):
        """Returns the lowercased lemma of a token."""
        return token.lemma_.lower()

    def _lemmatize(self, text):
        """Returns the lowercased lemma of every token in text."""
        doc = self.nlp(text)
        return [self._lemmatize_token(tok) for tok in doc]

    def _is_content(self, tok):
        """Returns True if the token is a noun, verb, adjective, or adverb."""
        return tok.pos_ in self._CONTENT_POS

    def tokenize(
        self, text, rm_stops=True, only_content=True, lemmatize=False, lowercase=True
    ):
        """
        Tokenize text (by default, lowercase and keep only non-stop content words)

        Args:
            text (str): text to be tokenized
            rm_stops (bool): remove stopwords
            only_content (bool): keep only content words (nouns, verbs, adjectives, adverbs)
            lemmatize (bool): lemmatize tokens
            lowercase (bool): convert words to lowercase

        Returns:
            tokens (list of str): list of tokens
        """
        tokenized = []
        for tok in self.nlp(text):
            if rm_stops and tok.is_stop:
                continue
            if only_content and not self._is_content(tok):
                continue
            word = tok.lemma_ if lemmatize else tok.text
            tokenized.append(word.lower() if lowercase else word)
        return tokenized
381
+
382
+
383
def schematicity(words, model, method, topic=None, pairs=None, lexsize=None):
    """
    Compute schematicity

    Args:
        words (list of str): tokens from a narrative
        model (VectorModel or NetworkModel): model to use for computing schematicity
        method (str): method of computing schematicity ('on-topic-ppn', 'topic-relatedness', 'pairwise-relatedness', or 'component-size')
        topic (str): topic word for topic-based methods
        pairs (str): for pairwise-relatedness, which pairs should be used ('all' for all pairs, 'adj' or 'adjacent' for bigrams/adjacent pairs)
        lexsize (int): for on-topic-ppn, this parameter is passed to the .get_lexicon() method of the model

    Returns:
        schem (float): schematicity measure (nan for pairwise-relatedness
            when there are fewer than two words, i.e. no pairs)

    Raises:
        ValueError: if an argument is invalid for the chosen method, or if
            any word is not in the model
    """

    # Validation
    if not isinstance(words, list):
        raise ValueError("words must be a list")
    if len(words) == 0:
        raise ValueError("words is empty")
    if not all(isinstance(word, str) for word in words):
        raise ValueError("all words must be strings")
    valid_methods = [
        "on-topic-ppn",
        "topic-relatedness",
        "pairwise-relatedness",
        "component-size",
    ]
    if method not in valid_methods:
        raise ValueError("method must be one of %s" % valid_methods)

    if method in ("on-topic-ppn", "topic-relatedness"):
        if topic is None:
            raise ValueError('topic must be specified for method "%s"' % method)
        if topic not in model:
            raise ValueError('topic "%s" is not in model' % topic)
    elif method == "pairwise-relatedness":
        if pairs not in ("all", "adj", "adjacent"):
            raise ValueError(
                'pairs must be one of "all", "adj" (or "adjacent") for method "pairwise-relatedness"'
            )
    elif method == "component-size":
        if not isinstance(model, NetworkModel):
            raise ValueError('model must be a NetworkModel for method "component-size"')

    if not all(word in model for word in words):
        raise ValueError(
            "not all words are in model. Use .keep_known() to filter out words not in the model"
        )

    if method == "on-topic-ppn":
        # The two model types name their lexicon-size parameter differently
        if isinstance(model, VectorModel):
            kwargs = {} if lexsize is None else {"top_n": lexsize}
        elif isinstance(model, NetworkModel):
            kwargs = {} if lexsize is None else {"max_steps": lexsize}
        else:
            raise ValueError("model must be a VectorModel or NetworkModel")
        # A set gives constant-time membership tests
        lexicon = set(model.get_lexicon(topic, **kwargs))
        # Proportion of the words that fall inside the lexicon
        return np.mean([word in lexicon for word in words])

    if method == "topic-relatedness":
        return np.mean([model.compute_sim(word, topic) for word in words])

    if method == "pairwise-relatedness":
        # Get word pairs
        if pairs == "all":
            word_pairs = get_pairs(words)
        else:
            word_pairs = list(zip(words[:-1], words[1:]))
        if not word_pairs:
            # A single word has no pairs, so the average is undefined
            return float("nan")
        # Compute average pairwise similarity
        return np.mean([model.compute_sim(*word_pair) for word_pair in word_pairs])

    # method == "component-size": share of the words that belong to the
    # largest connected component
    largest_component = model.largest_component(words)
    return len(largest_component) / len(words)
@@ -0,0 +1,84 @@
1
+ Metadata-Version: 2.4
2
+ Name: narsche
3
+ Version: 0.1.0
4
+ Summary: Computing schematicity of autobiographical narratives
5
+ Author-email: Isaac Kinley <isaac.kinley@gmail.com>
6
+ License-Expression: MIT
7
+ Requires-Python: >=3.9
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: networkx
11
+ Requires-Dist: spacy
12
+ Requires-Dist: wordfreq
13
+ Dynamic: license-file
14
+
15
+
16
+ # Measuring narrative schematicity
17
+
18
+ [![codecov](https://codecov.io/github/kinleyid/narsche/graph/badge.svg?token=EHCYVTZWCE)](https://codecov.io/github/kinleyid/narsche)
19
+
20
+ Methods from the paper "Computational Tools for Quantifying Schemas in Autobiographical Narratives".
21
+
22
+ ## Installation
23
+
24
+ [Under construction until this project is on PyPI]
25
+
26
+ Depends on `networkx` (for network models), `SpaCy` (for tokenization), and `wordfreq` (for automated topic identification). Additionally, one of `SpaCy`'s models must be downloaded (see their instructions on how to do this).
27
+
28
+ ## Usage
29
+
30
+ ### Loading and saving models
31
+
32
+ A text file of word vectors can be read using the `read_vectors()` function:
33
+
34
+ ```python
35
+ vec_mod = narsche.read_vectors('/path/to/vectors.txt')
36
+ ```
37
+
38
+ This produces a vector model. Initializing a network model requires first loading a `networkx.Graph` object:
39
+
40
+ ```python
41
+ import networkx as nx
42
+
43
+ graph = nx.read_graphml('/path/to/graph.graphml') # or any other networkx reader; every edge needs a 'strength' attribute
44
+ net_mod = narsche.NetworkModel(graph)
45
+ ```
46
+
47
+ Models can be saved using the `save()` method and loaded using the `load()` class method:
48
+
49
+ ```python
50
+ net_mod.save('network.mod')
51
+ net_mod = narsche.NetworkModel.load('network.mod')
52
+
53
+ vec_mod.save('vector.mod')
54
+ vec_mod = narsche.VectorModel.load('vector.mod')
55
+ ```
56
+
57
+ These are just wrappers around `pickle.[load/dump]`. Any extension can be used.
58
+
59
+ ### Tokenizing narratives
60
+
61
+ Before schematicity can be computed, narratives must be tokenized, i.e., converted to a list of tokens. For this, there is a `Tokenizer()` class that relies on `SpaCy`:
62
+
63
+ ```python
64
+ txt = 'I sat on the sofa in my living room with a lamp' # Example text
65
+ tokenizer = narsche.Tokenizer('en_core_web_sm') # Initialize tokenizer
66
+ words = tokenizer.tokenize(txt) # Tokenize words
67
+ words = vec_mod.keep_known(words) # Use only those words that are in the model
68
+ ```
69
+
70
+ ### Computing schematicity
71
+
72
+ Given a model and a set of tokens (and possibly a topic word), schematicity can be computed using the `schematicity()` function:
73
+
74
+ ```python
75
+ topic = narsche.identify_topic(words) # Identify the topic
76
+ # Compute schematicity
77
+ narsche.schematicity(
78
+ words=words,
79
+ model=vec_mod,
80
+ method='on-topic-ppn', # or topic-relatedness, pairwise-relatedness, or component-size
81
+ topic=topic)
82
+ ```
83
+
84
+ See the documentation of the `schematicity()` function for keywords required by other methods.
@@ -0,0 +1,10 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ narsche/__init__.py
5
+ narsche.egg-info/PKG-INFO
6
+ narsche.egg-info/SOURCES.txt
7
+ narsche.egg-info/dependency_links.txt
8
+ narsche.egg-info/requires.txt
9
+ narsche.egg-info/top_level.txt
10
+ tests/test_narsche.py
@@ -0,0 +1,3 @@
1
+ networkx
2
+ spacy
3
+ wordfreq
@@ -0,0 +1,3 @@
1
+ dist
2
+ narsche
3
+ tests
@@ -0,0 +1,20 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "narsche"
7
+ version = "0.1.0"
8
+ description = "Computing schematicity of autobiographical narratives"
9
+ authors = [
10
+ { name="Isaac Kinley", email="isaac.kinley@gmail.com" }
11
+ ]
12
+ readme = "README.md"
13
+ license = "MIT"
14
+ dependencies = [
15
+ "networkx", "spacy", "wordfreq"
16
+ ]
17
+ requires-python = ">=3.9"
18
+
19
+ [tool.setuptools.packages.find]
20
+ where = ["."]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,103 @@
1
+ import pytest
2
+ import narsche
3
+ import os
4
+ from pdb import set_trace
5
+
6
+
7
@pytest.fixture
def example_words():
    """Small word list used as the narrative in the schematicity tests."""
    tokens = ["sitting", "lamp", "desk", "office"]
    return tokens
10
+
11
+
12
@pytest.fixture
def cur_dir():
    """Directory that contains this test module."""
    return os.path.dirname(__file__)
16
+
17
+
18
@pytest.fixture
def vector_mod(cur_dir):
    """Vector model read from the sample vector file in the test directory."""
    return narsche.read_vectors(os.path.join(cur_dir, "sample-vectors.txt"))
23
+
24
+
25
@pytest.fixture
def network_mod(vector_mod):
    """Network model built from the vector model, linking words with similarity >= 0.9."""
    similarity_cutoff = 0.9
    return vector_mod.as_graph(threshold=similarity_cutoff)
28
+
29
+
30
def test_topic_identification():
    """The identified topic is a string taken from the supplied words."""
    words = ["chair", "sofa", "living", "room", "wall", "picture"]
    topic = narsche.identify_topic(words)
    assert isinstance(topic, str)
    assert topic in words
32
+
33
+
34
def test_read_vector(vector_mod):
    """read_vectors() returns a VectorModel."""
    expected_type = narsche.VectorModel
    assert isinstance(vector_mod, expected_type)
36
+
37
+
38
def test_save_vector_model(vector_mod, cur_dir):
    """Saving a vector model writes a file at the requested path."""
    path = os.path.join(cur_dir, "vector-model.mod")
    vector_mod.save(path)
    assert os.path.isfile(path)
40
+
41
+
42
def test_load_vector_model(vector_mod, tmp_path):
    """A saved vector model can be loaded back with the same vocabulary."""
    # Save into pytest's temporary directory so that this test does not
    # depend on a file left behind by another test
    path = tmp_path / "vector-model.mod"
    vector_mod.save(path)
    loaded = narsche.VectorModel.load(path)
    assert isinstance(loaded, narsche.VectorModel)
    assert loaded.words == vector_mod.words
44
+
45
+
46
def test_vector_model_methods(vector_mod):
    """Membership, similarity, and lexicon extraction on a vector model."""
    assert "lamp" in vector_mod
    vector_mod.compute_sim("lamp", "desk")
    with_topic = vector_mod.get_lexicon("lamp", top_n=2, including_topic=True)
    without_topic = vector_mod.get_lexicon("lamp", top_n=2, including_topic=False)
    assert len(with_topic) <= 2
    assert len(without_topic) <= 2
    assert "lamp" not in without_topic
51
+
52
+
53
def test_as_graph(vector_mod, network_mod):
    """as_graph() returns a NetworkModel restricted to the requested words."""
    assert isinstance(network_mod, narsche.NetworkModel)
    subset = ["lamp", "desk", "pottery"]
    sub_mod = vector_mod.as_graph(words=subset, threshold=0.3)
    assert isinstance(sub_mod, narsche.NetworkModel)
    assert all(word in subset for word in sub_mod.graph)
56
+
57
+
58
def test_save_network_model(network_mod, cur_dir):
    """Saving a network model writes a file at the requested path."""
    path = os.path.join(cur_dir, "network-model.mod")
    network_mod.save(path)
    assert os.path.isfile(path)
60
+
61
+
62
def test_load_network_model(network_mod, tmp_path):
    """A saved network model can be loaded back with the same nodes."""
    # Save into pytest's temporary directory so that this test does not
    # depend on a file left behind by another test
    path = tmp_path / "network-model.mod"
    network_mod.save(path)
    loaded = narsche.NetworkModel.load(path)
    assert isinstance(loaded, narsche.NetworkModel)
    assert set(loaded.graph) == set(network_mod.graph)
64
+
65
+
66
def test_network_model_methods(network_mod):
    """Membership, similarity, lexicon, and component extraction on a network model."""
    assert "lamp" in network_mod
    network_mod.compute_sim("lamp", "desk")
    with_topic = network_mod.get_lexicon("lamp", max_steps=1, including_topic=True)
    without_topic = network_mod.get_lexicon("lamp", max_steps=1, including_topic=False)
    assert "lamp" in with_topic
    assert "lamp" not in without_topic
    component = network_mod.largest_component(["lamp", "desk"])
    assert len(component) <= 2
72
+
73
+
74
def test_tokenizer():
    """With default options, tokenize() returns a list of lowercase strings."""
    tokenizer = narsche.Tokenizer()
    tokens = tokenizer.tokenize("This is a short piece of text")
    assert isinstance(tokens, list)
    assert all(isinstance(tok, str) and tok == tok.lower() for tok in tokens)
77
+
78
+
79
def test_schematicity_vector_model(vector_mod, example_words):
    """Every method supported by vector models runs on known words."""
    words = vector_mod.keep_known(example_words)
    ppn = narsche.schematicity(
        model=vector_mod, words=words, method="on-topic-ppn", topic="lamp"
    )
    # A proportion of words, so it must lie between 0 and 1
    assert 0 <= ppn <= 1
    narsche.schematicity(
        model=vector_mod, words=words, method="topic-relatedness", topic="lamp"
    )
    narsche.schematicity(
        model=vector_mod, words=words, method="pairwise-relatedness", pairs="adj"
    )
90
+
91
+
92
def test_schematicity_network_model(network_mod, example_words):
    """Every schematicity method runs on a network model with known words."""
    words = network_mod.keep_known(example_words)
    ppn = narsche.schematicity(
        model=network_mod, words=words, method="on-topic-ppn", topic="lamp"
    )
    # A proportion of words, so it must lie between 0 and 1
    assert 0 <= ppn <= 1
    narsche.schematicity(
        model=network_mod, words=words, method="topic-relatedness", topic="lamp"
    )
    narsche.schematicity(
        model=network_mod, words=words, method="pairwise-relatedness", pairs="adj"
    )
    size = narsche.schematicity(
        model=network_mod, words=words, method="component-size"
    )
    # Share of the words in the largest component, so at most 1
    assert 0 < size <= 1