narsche 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- narsche-0.1.0/LICENSE +21 -0
- narsche-0.1.0/PKG-INFO +84 -0
- narsche-0.1.0/README.md +70 -0
- narsche-0.1.0/narsche/__init__.py +465 -0
- narsche-0.1.0/narsche.egg-info/PKG-INFO +84 -0
- narsche-0.1.0/narsche.egg-info/SOURCES.txt +10 -0
- narsche-0.1.0/narsche.egg-info/dependency_links.txt +1 -0
- narsche-0.1.0/narsche.egg-info/requires.txt +3 -0
- narsche-0.1.0/narsche.egg-info/top_level.txt +3 -0
- narsche-0.1.0/pyproject.toml +20 -0
- narsche-0.1.0/setup.cfg +4 -0
- narsche-0.1.0/tests/test_narsche.py +103 -0
narsche-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) [year] [fullname]
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
narsche-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: narsche
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Computing schematicity of autobiographical narratives
|
|
5
|
+
Author-email: Isaac Kinley <isaac.kinley@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: networkx
|
|
11
|
+
Requires-Dist: spacy
|
|
12
|
+
Requires-Dist: wordfreq
|
|
13
|
+
Dynamic: license-file
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Measuring narrative schematicity
|
|
17
|
+
|
|
18
|
+
[](https://codecov.io/github/kinleyid/narsche)
|
|
19
|
+
|
|
20
|
+
Methods from the paper "Computational Tools for Quantifying Schemas in Autobiographical Narratives".
|
|
21
|
+
|
|
22
|
+
## Installation
|
|
23
|
+
|
|
24
|
+
[Under construction until this project is on PyPI]
|
|
25
|
+
|
|
26
|
+
Depends on `networkx` (for network models), `SpaCy` (for tokenization), and `wordfreq` for automated topic identification. Additionally, one of `SpaCy`'s models must be downloaded (see their instructions on how to do this).
|
|
27
|
+
|
|
28
|
+
## Usage
|
|
29
|
+
|
|
30
|
+
### Loading and saving models
|
|
31
|
+
|
|
32
|
+
A text file of word vectors can be read using the `read_vectors()` function:
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
vec_mod = narsche.read_vectors('/path/to/vectors.txt')
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
This produces a vector model. Initializing a network model requires first loading a `networkx.Graph` object:
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
import networkx as nx
|
|
42
|
+
|
|
43
|
+
graph = nx.read_graphml('/path/to/graph.graphml')  # or any other networkx reader; every edge needs a "strength" attribute
|
|
44
|
+
net_mod = narsche.NetworkModel(graph)
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Models can be saved using the `save()` method and loaded using the `load()` class method:
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
net_mod.save('network.mod')
|
|
51
|
+
net_mod = narsche.NetworkModel.load('network.mod')
|
|
52
|
+
|
|
53
|
+
vec_mod.save('vector.mod')
|
|
54
|
+
vec_mod = narsche.VectorModel.load('vector.mod')
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
These are just wrappers around `pickle.[load/dump]`. Any extension can be used.
|
|
58
|
+
|
|
59
|
+
### Tokenizing narratives
|
|
60
|
+
|
|
61
|
+
Before schematicity can be computed, narratives must be tokenized, i.e., converted to a list of tokens. For this, there is a `Tokenizer()` class that relies on `SpaCy`:
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
txt = 'I sat on the sofa in my living room with a lamp' # Example text
|
|
65
|
+
tokenizer = narsche.Tokenizer('en_core_web_sm') # Initialize tokenizer
|
|
66
|
+
words = tokenizer.tokenize(txt) # Tokenize words
|
|
67
|
+
words = vec_mod.keep_known(words) # Use only those words that are in the model
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Computing schematicity
|
|
71
|
+
|
|
72
|
+
Given a model and a set of tokens (and possibly a topic word), schematicity can be computed using the `schematicity()` function:
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
topic = narsche.identify_topic(words) # Identify the topic
|
|
76
|
+
# Compute schematicity
|
|
77
|
+
narsche.schematicity(
|
|
78
|
+
words=words,
|
|
79
|
+
model=vec_mod,  # or net_mod
|
|
80
|
+
method='on-topic-ppn', # or topic-relatedness, pairwise-relatedness, or component-size
|
|
81
|
+
topic=topic)
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
See the documentation of the `schematicity()` function for keywords required by other methods.
|
narsche-0.1.0/README.md
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
|
|
2
|
+
# Measuring narrative schematicity
|
|
3
|
+
|
|
4
|
+
[](https://codecov.io/github/kinleyid/narsche)
|
|
5
|
+
|
|
6
|
+
Methods from the paper "Computational Tools for Quantifying Schemas in Autobiographical Narratives".
|
|
7
|
+
|
|
8
|
+
## Installation
|
|
9
|
+
|
|
10
|
+
[Under construction until this project is on PyPI]
|
|
11
|
+
|
|
12
|
+
Depends on `networkx` (for network models), `SpaCy` (for tokenization), and `wordfreq` for automated topic identification. Additionally, one of `SpaCy`'s models must be downloaded (see their instructions on how to do this).
|
|
13
|
+
|
|
14
|
+
## Usage
|
|
15
|
+
|
|
16
|
+
### Loading and saving models
|
|
17
|
+
|
|
18
|
+
A text file of word vectors can be read using the `read_vectors()` function:
|
|
19
|
+
|
|
20
|
+
```python
|
|
21
|
+
vec_mod = narsche.read_vectors('/path/to/vectors.txt')
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
This produces a vector model. Initializing a network model requires first loading a `networkx.Graph` object:
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
import networkx as nx
|
|
28
|
+
|
|
29
|
+
graph = nx.read_graphml('/path/to/graph.graphml')  # or any other networkx reader; every edge needs a "strength" attribute
|
|
30
|
+
net_mod = narsche.NetworkModel(graph)
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
Models can be saved using the `save()` method and loaded using the `load()` class method:
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
net_mod.save('network.mod')
|
|
37
|
+
net_mod = narsche.NetworkModel.load('network.mod')
|
|
38
|
+
|
|
39
|
+
vec_mod.save('vector.mod')
|
|
40
|
+
vec_mod = narsche.VectorModel.load('vector.mod')
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
These are just wrappers around `pickle.[load/dump]`. Any extension can be used.
|
|
44
|
+
|
|
45
|
+
### Tokenizing narratives
|
|
46
|
+
|
|
47
|
+
Before schematicity can be computed, narratives must be tokenized, i.e., converted to a list of tokens. For this, there is a `Tokenizer()` class that relies on `SpaCy`:
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
txt = 'I sat on the sofa in my living room with a lamp' # Example text
|
|
51
|
+
tokenizer = narsche.Tokenizer('en_core_web_sm') # Initialize tokenizer
|
|
52
|
+
words = tokenizer.tokenize(txt) # Tokenize words
|
|
53
|
+
words = vec_mod.keep_known(words) # Use only those words that are in the model
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Computing schematicity
|
|
57
|
+
|
|
58
|
+
Given a model and a set of tokens (and possibly a topic word), schematicity can be computed using the `schematicity()` function:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
topic = narsche.identify_topic(words) # Identify the topic
|
|
62
|
+
# Compute schematicity
|
|
63
|
+
narsche.schematicity(
|
|
64
|
+
words=words,
|
|
65
|
+
model=vec_mod,  # or net_mod
|
|
66
|
+
method='on-topic-ppn', # or topic-relatedness, pairwise-relatedness, or component-size
|
|
67
|
+
topic=topic)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
See the documentation of the `schematicity()` function for keywords required by other methods.
|
|
@@ -0,0 +1,465 @@
|
|
|
1
|
+
import pickle
|
|
2
|
+
import numpy as np
|
|
3
|
+
from wordfreq import word_frequency
|
|
4
|
+
from collections import Counter
|
|
5
|
+
import networkx as nx
|
|
6
|
+
import spacy
|
|
7
|
+
import sys
|
|
8
|
+
from pdb import set_trace
|
|
9
|
+
|
|
10
|
+
# Machine epsilon (gap between 1.0 and the next representable float). Passed as the
# `minimum` floor to word_frequency() in identify_topic().
epsilon = sys.float_info.epsilon
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def get_pairs(iterable):
    """
    Gets every unordered pair of items from some iterable collection.

    Each item is paired with every item that comes after it, so n items yield
    n * (n - 1) / 2 pairs (all pairs, not only neighbouring ones).

    Args:
        iterable (iterable): some iterable collection (e.g., a list or tuple)

    Returns:
        pairs (list of tuple): all (earlier, later) pairs, ordered by the position of the first item
    """
    # Local import keeps this function independent of the module's import block
    from itertools import combinations

    # combinations() accepts any iterable (generators, sets, ...), whereas the
    # previous slicing-based implementation only worked on sequences
    return list(combinations(iterable, 2))
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def identify_topic(words):
    """
    Identifies the topic using tf-idf

    Term frequency is the number of times a word occurs in `words`; document
    frequency is the word's English frequency as reported by wordfreq. The
    word with the largest tf * log(1 / df) is returned.

    Args:
        words (list): a list of words

    Returns:
        topic (str): the topic word

    Raises:
        ValueError: if words is empty
    """
    if len(words) == 0:
        raise ValueError("words is empty")

    tf = Counter(words)
    # Alphabetical order makes tie-breaking deterministic (first word wins)
    bag = sorted(tf)
    # `minimum` floors the frequency of unknown words at machine epsilon, so df
    # is always positive and the idf below is always finite
    df = {word: word_frequency(word=word, lang="en", minimum=epsilon) for word in bag}
    tf_idf = {word: tf[word] * np.log(1 / df[word]) for word in bag}
    # Get topic (top ranked)
    return str(max(bag, key=tf_idf.__getitem__))
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def read_vectors(file, encoding="utf-8"):
    """
    Reads word vectors from a text file. Each line of the file should be formatted <word> <dim1> <dim2> <dim3> ... Vectors are automatically normalized

    Blank lines are skipped and trailing spaces are ignored. All-zero vectors
    are left as zeros (they cannot be normalized).

    Args:
        file (str): name of text file
        encoding (str): encoding used when reading the file

    Returns:
        model (VectorModel): vectors in a VectorModel

    Raises:
        ValueError: if the file contains no vectors or the vectors differ in length
    """
    words = []
    vectors = []
    with open(file, "r", encoding=encoding) as f:
        for line in f:
            # Strip the newline and trailing spaces so that no empty field is
            # handed to the float conversion below
            line = line.rstrip("\r\n").rstrip(" ")
            if not line:
                continue
            # First item in space-delimited line is token, remaining items are vector elements
            token, *elements = line.split(" ")
            words.append(token)
            vectors.append(np.asarray(elements).astype(np.float32))
    if not words:
        raise ValueError("no vectors found in %s" % file)
    vectors = np.array(vectors)
    if vectors.ndim != 2:
        raise ValueError("vectors in %s do not all have the same length" % file)
    # Normalize so that a plain dot product equals cosine similarity
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1  # Avoid 0 / 0 = NaN for all-zero vectors
    vectors /= norms
    return VectorModel(words, vectors)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class Model:
    """
    Base class providing persistence and vocabulary filtering

    Subclasses are expected to implement __contains__ so that `word in model`
    reports whether the model knows a word (keep_known() relies on it).
    """

    def save(self, path):
        """
        Pickle the model to a file

        Args:
            path (str): destination file
        """
        with open(path, "wb") as f:
            pickle.dump(self, f)

    @classmethod
    def load(cls, path):
        """
        Load a pickled model from a file

        Args:
            path (str): file previously written by save()

        Returns:
            obj: the unpickled model, an instance of the class load() was called on

        Raises:
            TypeError: if the file does not contain an instance of this class
        """
        # NOTE(security): pickle.load can execute arbitrary code; only load files from trusted sources
        with open(path, "rb") as f:
            obj = pickle.load(f)
        if not isinstance(obj, cls):
            raise TypeError(
                "Expected instance of %s, got %s" % (cls.__name__, type(obj).__name__)
            )
        return obj

    def keep_known(self, words):
        """
        Filter out words that are not in the model

        Args:
            words (list of str): candidate words

        Returns:
            known (list of str): the words for which `word in self` is true, in their original order
        """
        return [word for word in words if word in self]
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class VectorModel(Model):
    """
    Vector-based model

    Attributes:
        words (list of str): words for which the model has vectors
        vectors (matrix): numpy matrix containing the vectors (in the same order as the word list)
    """

    def __init__(self, words, vectors):
        """
        Initializes vector model

        Args:
            words (list of str): words in model
            vectors (numpy array): array of corresponding vectors

        Raises:
            ValueError: if words is a list containing non-strings, if vectors is
                not an np.ndarray, or if the two differ in length
        """
        if isinstance(words, list):
            if not all(isinstance(word, str) for word in words):
                raise ValueError("words is not a list of strings")
        if not isinstance(vectors, np.ndarray):
            raise ValueError("vectors is not an np.ndarray")
        if len(words) != len(vectors):
            raise ValueError("different numbers of words and vectors")
        # Store efficiently---list of words and matrix of vectors
        self.words = words
        self.vectors = vectors

    def __contains__(self, word):
        return word in self.words

    def in_model(self, word):
        """Same as `word in model`."""
        return word in self

    def compute_sim(self, word1, word2):
        """
        Compute similarity between words as the dot product of their vectors

        This equals cosine similarity when the stored vectors are unit-length
        (read_vectors() normalizes them).

        Args:
            word1 (str): first word
            word2 (str): second word

        Returns:
            sim (float): similarity (nan if either word is not in the model)
        """
        try:
            i1, i2 = self.words.index(word1), self.words.index(word2)
        except ValueError:
            # At least one word is unknown
            return float("nan")
        return np.dot(self.vectors[i1], self.vectors[i2])

    def get_lexicon(self, topic, top_n=10000, including_topic=True):
        """
        Get "lexicon" of words most related to a topic

        Args:
            topic (str): topic word
            top_n (int): size of lexicon
            including_topic (bool): should the topic word be included in the lexicon?

        Returns:
            lexicon (list of str): list of words most related to the topic, in ascending order of similarity

        Raises:
            ValueError: if topic is not in the model or top_n is negative
        """
        if topic not in self.words:
            raise ValueError('topic "%s" is not in model' % topic)
        if top_n < 0:
            raise ValueError("top_n must be non-negative")
        # First compute similarities (faster than constructing new matrix not including topic)
        topic_vector = self.vectors[self.words.index(topic)]
        similarities = np.matmul(self.vectors, topic_vector)
        # Sort by similarity (ascending, so the most related words are last)
        sort_idx = np.argsort(similarities)
        sorted_words = [self.words[i] for i in sort_idx]
        # Remove topic word itself?
        if not including_topic:
            sorted_words.remove(topic)
        # Pare down. top_n == 0 needs its own branch: [-0:] is the whole list
        if top_n == 0:
            return []
        return sorted_words[-top_n:]

    def as_graph(self, threshold, words=None):
        """
        Convert vector model to network model

        Only words that share at least one edge end up as nodes in the
        resulting network.

        Args:
            threshold (float): only pairs of words whose cosine similarity is greater than or equal to this threshold will share an edge in the resulting network
            words (list of str): for speed, only this subset of words will be used to produce the network (rather than all words in the vector-based model)

        Returns:
            model (NetworkModel): graph-based model
        """
        # Row of the first occurrence of each word (what list.index() returns),
        # built once so that each pair costs O(1) lookups instead of list scans
        row_of = {}
        for row, word in enumerate(self.words):
            row_of.setdefault(word, row)

        # Get only those tokens that are actually in current dictionary
        if words is not None:
            words = [w for w in words if w in row_of]
        else:
            words = self.words

        graph = nx.Graph()
        for word1, word2 in get_pairs(words):
            sim = np.dot(self.vectors[row_of[word1]], self.vectors[row_of[word2]])
            if sim >= threshold:
                graph.add_edge(word1, word2, strength=sim)
        # Create network model
        return NetworkModel(graph)
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
class NetworkModel(Model):
    """
    Network-based model

    Attributes:
        graph (networkx.Graph): graph of words
    """

    def __init__(self, graph):
        """
        Initializes network model

        An "inv_strength" attribute (1 / strength) is added to every edge of
        the graph that is passed in; the graph is modified in place, not copied.

        Args:
            graph (networkx.Graph): network of words whose edges include a "strength" attribute

        Raises:
            TypeError: if graph is not a networkx.Graph
            ValueError: if an edge lacks a "strength" attribute or its strength is not positive
        """
        if not isinstance(graph, nx.Graph):
            raise TypeError("Expected a networkx.Graph, got %s" % type(graph).__name__)
        # Compute inverse strength
        inv_strength = {}
        for u, v, data in graph.edges(data=True):
            if "strength" not in data:
                raise ValueError(
                    "Edge (%s, %s) is missing 'strength' attribute" % (u, v)
                )
            strength = data["strength"]
            # Zero cannot be inverted and negative weights are invalid for
            # Dijkstra; `not >` also rejects NaN
            if not strength > 0:
                raise ValueError(
                    "Edge (%s, %s) has non-positive 'strength' %s" % (u, v, strength)
                )
            inv_strength[(u, v)] = 1 / strength
        nx.set_edge_attributes(graph, inv_strength, "inv_strength")
        self.graph = graph

    def __contains__(self, word):
        return word in self.graph

    def in_model(self, word):
        """Same as `word in model`."""
        return word in self

    def compute_sim(self, word1, word2):
        """
        Compute efficiency-based similarity (i.e., the inverse of the length of the shortest path between words)

        Args:
            word1 (str): first word
            word2 (str): second word

        Returns:
            efficiency (float): efficiency-based similarity measure (nan if either word is not in the model, 0 if no path connects the words)
        """
        if word1 not in self.graph or word2 not in self.graph:
            return float("nan")
        try:
            distance, _ = nx.bidirectional_dijkstra(
                self.graph, word1, word2, weight="inv_strength"
            )
        except nx.NetworkXNoPath:
            # No path between nodes
            return 0
        if distance == 0:
            # word1 == word2. NOTE(review): the inverse is undefined here; 0 is
            # returned because that is what this method has always produced
            # for identical words -- confirm that this is the intended value
            return 0
        return 1 / distance

    def get_lexicon(self, topic, max_steps=2, including_topic=True):
        """
        Get "lexicon" of words most related to a topic. This function is a wrapper around networkx.ego_graph()

        Args:
            topic (str): topic word
            max_steps (int): number of steps to traverse to identify related words
            including_topic (bool): should the topic word be included in the lexicon?

        Returns:
            lexicon (list of str): list of words most related to the topic
        """
        # distance=None is passed so that no edge attribute is used as a distance
        ego_graph = nx.ego_graph(
            self.graph, n=topic, radius=max_steps, center=including_topic, distance=None
        )
        return list(ego_graph)

    def largest_component(self, words):
        """
        Get largest network component in the subgraph induced by words

        Args:
            words (list of str): words by which to induce subgraph

        Returns:
            component: set of the words in the largest connected component, or
                an empty networkx.Graph if the induced subgraph has no nodes.
                Both support len().
        """
        subgraph = self.graph.subgraph(words)
        # max() keeps the first of equally sized components, like a stable sort would
        largest = max(nx.connected_components(subgraph), key=len, default=None)
        if largest is None:
            # No component
            return nx.Graph()
        return largest
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
class Tokenizer:
    """
    SpaCy-based tokenizer

    Attributes:
        nlp (spacy model): SpaCy model used to tokenize
    """

    def __init__(self, spacy_model="en_core_web_sm"):
        """
        Load the SpaCy model

        Args:
            spacy_model (str): name of the SpaCy model passed to spacy.load()
        """
        self.nlp = spacy.load(spacy_model)

    def _lemmatize_token(self, token):
        # Lowercased lemma of a single token
        return token.lemma_.lower()

    def _lemmatize(self, text):
        # Lowercased lemma of every token in the text (no filtering)
        return [self._lemmatize_token(tok) for tok in self.nlp(text)]

    def _is_content(self, tok):
        # Content words are nouns, verbs, adjectives, and adverbs
        return tok.pos_ in ("NOUN", "VERB", "ADJ", "ADV")

    def tokenize(
        self, text, rm_stops=True, only_content=True, lemmatize=False, lowercase=True
    ):
        """
        Tokenize text (by default, lowercase and keep only non-stop content words)

        Args:
            text (str): text to be tokenized
            rm_stops (bool): remove stopwords
            only_content (bool): keep only content words (nouns, verbs, adjectives, adverbs)
            lemmatize (bool): lemmatize tokens
            lowercase (bool): convert words to lowercase

        Returns:
            tokens (list of str): list of tokens
        """
        tokenized = []
        for tok in self.nlp(text):
            if rm_stops and tok.is_stop:
                continue
            # Single definition of "content word", shared with _is_content()
            if only_content and not self._is_content(tok):
                continue
            word = tok.lemma_ if lemmatize else tok.text
            if lowercase:
                word = word.lower()
            tokenized.append(word)
        return tokenized
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
def schematicity(words, model, method, topic=None, pairs=None, lexsize=None):
    """
    Compute schematicity

    Args:
        words (list of str): tokens from a narrative
        model (VectorModel or NetworkModel): model to use for computing schematicity
        method (str): method of computing schematicity ('on-topic-ppn', 'topic-relatedness', 'pairwise-relatedness', or 'component-size')
        topic (str): topic word for topic-based methods
        pairs (str): for pairwise-relatedness, which pairs should be used ('all' for all pairs, 'adj' or 'adjacent' for bigrams/adjacent pairs)
        lexsize (int): for on-topic-ppn, this parameter is passed to the .get_lexicon() method of the model

    Returns:
        schem (float): schematicity measure (nan for pairwise-relatedness when there are fewer than two words)

    Raises:
        ValueError: if any argument is invalid for the chosen method, or if some words are not in the model
    """

    # Validation
    if not isinstance(words, list):
        raise ValueError("words must be a list")
    if not words:
        raise ValueError("words is empty")
    if not all(isinstance(word, str) for word in words):
        raise ValueError("all words must be strings")
    valid_methods = [
        "on-topic-ppn",
        "topic-relatedness",
        "pairwise-relatedness",
        "component-size",
    ]
    if method not in valid_methods:
        raise ValueError("method must be one of %s" % valid_methods)

    if method in ("on-topic-ppn", "topic-relatedness"):
        if topic is None:
            raise ValueError('topic must be specified for method "%s"' % method)
        if topic not in model:
            raise ValueError('topic "%s" is not in model' % topic)
    elif method == "pairwise-relatedness":
        # "adjacent" is accepted as a synonym of "adj"
        if pairs not in ("all", "adj", "adjacent"):
            raise ValueError(
                'pairs must be one of "all", "adj", "adjacent" for method "pairwise-relatedness"'
            )
    elif method == "component-size":
        if not isinstance(model, NetworkModel):
            raise ValueError('model must be a NetworkModel for method "component-size"')

    if not all(word in model for word in words):
        raise ValueError(
            "not all words are in model. Use .keep_known() to filter out words not in the model"
        )

    if method == "on-topic-ppn":
        # lexsize means a word count for vector models and a step count for network models
        if isinstance(model, VectorModel):
            kwargs = {} if lexsize is None else {"top_n": lexsize}
        elif isinstance(model, NetworkModel):
            kwargs = {} if lexsize is None else {"max_steps": lexsize}
        else:
            raise ValueError("model must be a VectorModel or NetworkModel")
        # A set makes each membership test O(1); lexicons can hold thousands of words
        lexicon = set(model.get_lexicon(topic, **kwargs))
        return np.mean([word in lexicon for word in words])

    elif method == "topic-relatedness":
        return np.mean([model.compute_sim(word, topic) for word in words])

    elif method == "pairwise-relatedness":
        # Get word pairs
        if pairs == "all":
            word_pairs = get_pairs(words)
        else:
            word_pairs = list(zip(words[:-1], words[1:]))
        if not word_pairs:
            # A single word has no pairs; return nan without numpy's empty-mean warning
            return float("nan")
        # Compute average pairwise similarity
        return np.mean([model.compute_sim(*word_pair) for word_pair in word_pairs])

    elif method == "component-size":
        # Size of the largest connected component relative to the number of tokens
        largest_component = model.largest_component(words)
        return len(largest_component) / len(words)
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: narsche
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Computing schematicity of autobiographical narratives
|
|
5
|
+
Author-email: Isaac Kinley <isaac.kinley@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: networkx
|
|
11
|
+
Requires-Dist: spacy
|
|
12
|
+
Requires-Dist: wordfreq
|
|
13
|
+
Dynamic: license-file
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Measuring narrative schematicity
|
|
17
|
+
|
|
18
|
+
[](https://codecov.io/github/kinleyid/narsche)
|
|
19
|
+
|
|
20
|
+
Methods from the paper "Computational Tools for Quantifying Schemas in Autobiographical Narratives".
|
|
21
|
+
|
|
22
|
+
## Installation
|
|
23
|
+
|
|
24
|
+
[Under construction until this project is on PyPI]
|
|
25
|
+
|
|
26
|
+
Depends on `networkx` (for network models), `SpaCy` (for tokenization), and `wordfreq` for automated topic identification. Additionally, one of `SpaCy`'s models must be downloaded (see their instructions on how to do this).
|
|
27
|
+
|
|
28
|
+
## Usage
|
|
29
|
+
|
|
30
|
+
### Loading and saving models
|
|
31
|
+
|
|
32
|
+
A text file of word vectors can be read using the `read_vectors()` function:
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
vec_mod = narsche.read_vectors('/path/to/vectors.txt')
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
This produces a vector model. Initializing a network model requires first loading a `networkx.Graph` object:
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
import networkx as nx
|
|
42
|
+
|
|
43
|
+
graph = nx.read_graphml('/path/to/graph.graphml')  # or any other networkx reader; every edge needs a "strength" attribute
|
|
44
|
+
net_mod = narsche.NetworkModel(graph)
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Models can be saved using the `save()` method and loaded using the `load()` class method:
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
net_mod.save('network.mod')
|
|
51
|
+
net_mod = narsche.NetworkModel.load('network.mod')
|
|
52
|
+
|
|
53
|
+
vec_mod.save('vector.mod')
|
|
54
|
+
vec_mod = narsche.VectorModel.load('vector.mod')
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
These are just wrappers around `pickle.[load/dump]`. Any extension can be used.
|
|
58
|
+
|
|
59
|
+
### Tokenizing narratives
|
|
60
|
+
|
|
61
|
+
Before schematicity can be computed, narratives must be tokenized, i.e., converted to a list of tokens. For this, there is a `Tokenizer()` class that relies on `SpaCy`:
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
txt = 'I sat on the sofa in my living room with a lamp' # Example text
|
|
65
|
+
tokenizer = narsche.Tokenizer('en_core_web_sm') # Initialize tokenizer
|
|
66
|
+
words = tokenizer.tokenize(txt) # Tokenize words
|
|
67
|
+
words = vec_mod.keep_known(words) # Use only those words that are in the model
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Computing schematicity
|
|
71
|
+
|
|
72
|
+
Given a model and a set of tokens (and possibly a topic word), schematicity can be computed using the `schematicity()` function:
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
topic = narsche.identify_topic(words) # Identify the topic
|
|
76
|
+
# Compute schematicity
|
|
77
|
+
narsche.schematicity(
|
|
78
|
+
words=words,
|
|
79
|
+
model=vec_mod,  # or net_mod
|
|
80
|
+
method='on-topic-ppn', # or topic-relatedness, pairwise-relatedness, or component-size
|
|
81
|
+
topic=topic)
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
See the documentation of the `schematicity()` function for keywords required by other methods.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "narsche"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Computing schematicity of autobiographical narratives"
|
|
9
|
+
authors = [
|
|
10
|
+
{ name="Isaac Kinley", email="isaac.kinley@gmail.com" }
|
|
11
|
+
]
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
license = "MIT"
|
|
14
|
+
dependencies = [
|
|
15
|
+
"networkx", "spacy", "wordfreq"
|
|
16
|
+
]
|
|
17
|
+
requires-python = ">=3.9"
|
|
18
|
+
|
|
19
|
+
[tool.setuptools.packages.find]
|
|
20
|
+
where = ["."]
|
narsche-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
import narsche
|
|
3
|
+
import os
|
|
4
|
+
from pdb import set_trace
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@pytest.fixture
def example_words():
    """Small list of tokens used by the schematicity tests."""
    words = ["sitting", "lamp", "desk", "office"]
    return words
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@pytest.fixture
def cur_dir():
    """Directory that contains this test file."""
    return os.path.dirname(__file__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@pytest.fixture
def vector_mod(cur_dir):
    """Vector model read from sample-vectors.txt in the test directory."""
    return narsche.read_vectors(os.path.join(cur_dir, "sample-vectors.txt"))
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@pytest.fixture
def network_mod(vector_mod):
    """Network model derived from the sample vector model (edge threshold 0.9)."""
    graph_model = vector_mod.as_graph(threshold=0.9)
    return graph_model
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_topic_identification():
    """The identified topic is one of the input words."""
    words = ["chair", "sofa", "living", "room", "wall", "picture"]
    topic = narsche.identify_topic(words)
    assert isinstance(topic, str)
    assert topic in words
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def test_read_vector(vector_mod):
    """read_vectors() produces a VectorModel."""
    expected_type = narsche.VectorModel
    assert isinstance(vector_mod, expected_type)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_save_vector_model(vector_mod, tmp_path):
    """Saving writes a file at the requested path."""
    # tmp_path keeps test artifacts out of the source tree
    path = tmp_path / "vector-model.mod"
    vector_mod.save(path)
    assert path.exists()


def test_load_vector_model(vector_mod, tmp_path):
    """A saved vector model loads back as a VectorModel with the same words."""
    # Save first so this test does not depend on another test having run
    path = tmp_path / "vector-model.mod"
    vector_mod.save(path)
    loaded = narsche.VectorModel.load(path)
    assert isinstance(loaded, narsche.VectorModel)
    assert loaded.words == vector_mod.words
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_vector_model_methods(vector_mod):
    """Membership, similarity, and lexicon extraction on the vector model."""
    assert "lamp" in vector_mod
    vector_mod.compute_sim("lamp", "desk")
    with_topic = vector_mod.get_lexicon("lamp", top_n=2, including_topic=True)
    without_topic = vector_mod.get_lexicon("lamp", top_n=2, including_topic=False)
    assert len(with_topic) <= 2
    assert len(without_topic) <= 2
    assert "lamp" not in without_topic
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def test_as_graph(vector_mod, network_mod):
    """as_graph() yields a NetworkModel, with or without a word subset."""
    assert isinstance(network_mod, narsche.NetworkModel)
    subset = ["lamp", "desk", "pottery"]
    vector_mod.as_graph(words=subset, threshold=0.3)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def test_save_network_model(network_mod, tmp_path):
    """Saving writes a file at the requested path."""
    # tmp_path keeps test artifacts out of the source tree
    path = tmp_path / "network-model.mod"
    network_mod.save(path)
    assert path.exists()


def test_load_network_model(network_mod, tmp_path):
    """A saved network model loads back as a NetworkModel with the same nodes."""
    # Save first so this test does not depend on another test having run
    path = tmp_path / "network-model.mod"
    network_mod.save(path)
    loaded = narsche.NetworkModel.load(path)
    assert isinstance(loaded, narsche.NetworkModel)
    assert set(loaded.graph) == set(network_mod.graph)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_network_model_methods(network_mod):
    """Membership, similarity, lexicon extraction, and components on the network model."""
    assert "lamp" in network_mod
    network_mod.compute_sim("lamp", "desk")
    with_topic = network_mod.get_lexicon("lamp", max_steps=1, including_topic=True)
    without_topic = network_mod.get_lexicon("lamp", max_steps=1, including_topic=False)
    assert "lamp" in with_topic
    assert "lamp" not in without_topic
    network_mod.largest_component(["lamp", "desk"])
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def test_tokenizer():
    """Default tokenization returns a list of lowercase strings."""
    tokenizer = narsche.Tokenizer()
    tokens = tokenizer.tokenize("This is a short piece of text")
    assert isinstance(tokens, list)
    assert all(isinstance(tok, str) for tok in tokens)
    # lowercase=True is the default
    assert all(tok == tok.lower() for tok in tokens)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def test_schematicity_vector_model(vector_mod, example_words):
    """Every method applicable to vector models runs on known words."""
    words = vector_mod.keep_known(example_words)
    ppn = narsche.schematicity(
        model=vector_mod, words=words, method="on-topic-ppn", topic="lamp"
    )
    # A proportion of on-topic words must lie between 0 and 1
    assert 0 <= ppn <= 1
    narsche.schematicity(
        model=vector_mod, words=words, method="topic-relatedness", topic="lamp"
    )
    narsche.schematicity(
        model=vector_mod, words=words, method="pairwise-relatedness", pairs="adj"
    )
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def test_schematicity_network_model(network_mod, example_words):
    """Every schematicity method runs on a network model with known words."""
    words = network_mod.keep_known(example_words)
    ppn = narsche.schematicity(
        model=network_mod, words=words, method="on-topic-ppn", topic="lamp"
    )
    # A proportion of on-topic words must lie between 0 and 1
    assert 0 <= ppn <= 1
    narsche.schematicity(
        model=network_mod, words=words, method="topic-relatedness", topic="lamp"
    )
    narsche.schematicity(
        model=network_mod, words=words, method="pairwise-relatedness", pairs="adj"
    )
    size = narsche.schematicity(model=network_mod, words=words, method="component-size")
    # All words are nodes of the graph, so the largest component holds at least one of them
    assert 0 < size <= 1
|