pystylometry-0.1.0-py3-none-any.whl → pystylometry-1.0.0-py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
- pystylometry/__init__.py +1 -2
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1224 -2
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +4 -0
- pystylometry/authorship/additional_methods.py +100 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +301 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +641 -0
- pystylometry/lexical/function_words.py +391 -0
- pystylometry/lexical/hapax.py +154 -7
- pystylometry/lexical/mtld.py +83 -7
- pystylometry/lexical/ttr.py +83 -0
- pystylometry/lexical/word_frequency_sophistication.py +581 -0
- pystylometry/lexical/yule.py +34 -7
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +985 -0
- pystylometry/readability/ari.py +93 -17
- pystylometry/readability/coleman_liau.py +102 -9
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +59 -14
- pystylometry/readability/gunning_fog.py +194 -25
- pystylometry/readability/smog.py +31 -14
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +432 -0
- pystylometry/syntactic/pos_ratios.py +104 -13
- pystylometry/syntactic/sentence_stats.py +57 -13
- pystylometry/syntactic/sentence_types.py +470 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/METADATA +49 -12
- pystylometry-1.0.0.dist-info/RECORD +46 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/WHEEL +1 -1
- pystylometry-0.1.0.dist-info/RECORD +0 -26
@@ -0,0 +1,131 @@
+"""Stylistic markers for authorship attribution.
+
+This module identifies and analyzes specific linguistic features that authors
+use consistently and often subconsciously. These markers include contraction
+preferences, intensifier usage, hedging patterns, modal auxiliaries, negation
+patterns, and punctuation style habits.
+
+Related GitHub Issue:
+    #20 - Stylistic Markers
+    https://github.com/craigtrim/pystylometry/issues/20
+
+Categories of stylistic markers:
+    - Contraction patterns (can't vs. cannot, I'm vs. I am)
+    - Intensifiers (very, really, extremely, quite)
+    - Hedges (maybe, perhaps, probably, somewhat)
+    - Modal auxiliaries (can, could, may, might, must, should, will, would)
+    - Negation patterns (not, no, never, none, neither)
+    - Punctuation style (exclamations, questions, quotes, parentheticals)
+
+References:
+    Argamon, S., & Levitan, S. (2005). Measuring the usefulness of function
+        words for authorship attribution. ACH/ALLC.
+    Pennebaker, J. W. (2011). The secret life of pronouns. Bloomsbury Press.
+    Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
+"""
+
+from .._types import StylisticMarkersResult
+
+
+def compute_stylistic_markers(text: str) -> StylisticMarkersResult:
+    """
+    Analyze stylistic markers for authorship attribution.
+
+    Identifies and quantifies specific linguistic features that reveal authorial
+    style. These features are often used subconsciously and remain consistent
+    across an author's works, making them valuable for attribution.
+
+    Related GitHub Issue:
+        #20 - Stylistic Markers
+        https://github.com/craigtrim/pystylometry/issues/20
+
+    Why stylistic markers matter:
+
+    Subconscious usage:
+        - Authors don't deliberately vary these features
+        - Remain consistent even when author tries to disguise style
+        - Difficult to consciously control
+
+    Genre-independent:
+        - Used similarly across different topics
+        - More stable than content words
+        - Complement content-based features
+
+    Psychologically meaningful:
+        - Reveal personality traits (Pennebaker's research)
+        - Indicate emotional state
+        - Show cognitive patterns
+
+    Marker Categories Analyzed:
+
+    1. Contractions:
+        - Preference for contracted vs. expanded forms
+        - Examples: can't/cannot, I'm/I am, won't/will not
+        - Formality indicator (more contractions = informal)
+
+    2. Intensifiers:
+        - Words that amplify meaning
+        - Examples: very, really, extremely, quite, rather
+        - Indicate emphatic style
+
+    3. Hedges:
+        - Words that weaken or qualify statements
+        - Examples: maybe, perhaps, probably, somewhat, kind of
+        - Indicate tentative or cautious style
+
+    4. Modal Auxiliaries:
+        - Express necessity, possibility, permission
+        - Epistemic modals: may, might, could (possibility)
+        - Deontic modals: must, should, ought (obligation)
+
+    5. Negation:
+        - Patterns of negative expression
+        - not, no, never, none, neither, nowhere
+        - Frequency and type vary by author
+
+    6. Punctuation Style:
+        - Exclamation marks: Emphatic, emotional
+        - Question marks: Interactive, rhetorical
+        - Quotation marks: Dialogue, scare quotes
+        - Parentheticals: Asides, additional info
+        - Ellipses: Trailing off, suspense
+        - Dashes: Interruptions, emphasis
+        - Semicolons/colons: Sophisticated syntax
+
+    Args:
+        text: Input text to analyze. Should contain at least 200+ words for
+            reliable statistics. Shorter texts may have unstable marker ratios.
+
+    Returns:
+        StylisticMarkersResult containing extensive marker statistics.
+        See _types.py for complete field list.
+
+    Example:
+        >>> result = compute_stylistic_markers("Sample text with markers...")
+        >>> print(f"Contraction ratio: {result.contraction_ratio * 100:.1f}%")
+        Contraction ratio: 42.3%
+        >>> print(f"Intensifiers/100 words: {result.intensifier_density:.2f}")
+        Intensifiers/100 words: 3.45
+        >>> print(f"Top intensifiers: {result.top_intensifiers[:3]}")
+        Top intensifiers: [('very', 12), ('really', 8), ('quite', 5)]
+        >>> print(f"Exclamation density: {result.exclamation_density:.2f}")
+        Exclamation density: 2.10
+
+    Note:
+        - Densities are per 100 words for interpretability
+        - Contraction detection requires pattern matching
+        - Modal auxiliaries classified as epistemic or deontic
+        - Punctuation counts include all occurrences
+        - Empty text returns NaN for ratios, 0 for counts
+    """
+    # TODO: Implement stylistic marker analysis
+    # GitHub Issue #20: https://github.com/craigtrim/pystylometry/issues/20
+    #
+    # This is a comprehensive implementation with many components.
+    # Break it down into logical sections.
+    #
+    # See GitHub issue for full implementation plan and word lists.
+    raise NotImplementedError(
+        "Stylistic markers not yet implemented. "
+        "See GitHub Issue #20: https://github.com/craigtrim/pystylometry/issues/20"
+    )
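`compute_stylistic_markers` ships as a stub in 1.0.0 (the body raises `NotImplementedError`), so the docstring above is the only specification. The following is a minimal sketch of how two of the documented fields, intensifier density per 100 words and a contraction ratio, could be computed. The word list, tokenizer regex, and apostrophe heuristic are assumptions of this sketch, not pystylometry code; the real word lists live in GitHub Issue #20.

```python
import re

# Illustrative word list only; the release defers the real list to Issue #20.
INTENSIFIERS = {"very", "really", "extremely", "quite", "rather"}


def intensifier_density(text: str) -> float:
    """Intensifiers per 100 words; NaN for empty text (per the Note above)."""
    words = re.findall(r"[a-z']+", text.lower())
    if not words:
        return float("nan")
    hits = sum(1 for word in words if word in INTENSIFIERS)
    return 100.0 * hits / len(words)


def contraction_ratio(text: str) -> float:
    """Share of tokens containing an apostrophe (a crude contraction proxy)."""
    words = re.findall(r"[a-z']+", text.lower())
    if not words:
        return float("nan")
    return sum(1 for word in words if "'" in word) / len(words)


print(f"{intensifier_density('It was very, very good and quite rare.'):.2f}")
# 37.50 (3 intensifiers in 8 words)
```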
@@ -0,0 +1,47 @@
+"""Vocabulary overlap and similarity metrics.
+
+This module computes similarity measures between two texts based on their
+shared vocabulary. Useful for authorship verification, plagiarism detection,
+and measuring stylistic consistency.
+
+Related GitHub Issue:
+    #21 - Vocabulary Overlap and Similarity Metrics
+    https://github.com/craigtrim/pystylometry/issues/21
+
+References:
+    Jaccard, P. (1912). The distribution of the flora in the alpine zone.
+    Salton, G., & McGill, M. J. (1983). Introduction to Modern Information Retrieval.
+"""
+
+from .._types import VocabularyOverlapResult
+
+
+def compute_vocabulary_overlap(text1: str, text2: str) -> VocabularyOverlapResult:
+    """
+    Compute vocabulary overlap and similarity between two texts.
+
+    Related GitHub Issue:
+        #21 - Vocabulary Overlap and Similarity Metrics
+        https://github.com/craigtrim/pystylometry/issues/21
+
+    Args:
+        text1: First text to compare
+        text2: Second text to compare
+
+    Returns:
+        VocabularyOverlapResult with Jaccard, Dice, cosine similarities,
+        shared vocabulary statistics, and distinctive words for each text.
+
+    Example:
+        >>> result = compute_vocabulary_overlap(text1, text2)
+        >>> print(f"Jaccard similarity: {result.jaccard_similarity:.3f}")
+        Jaccard similarity: 0.456
+        >>> print(f"Shared words: {result.shared_vocab_size}")
+        Shared words: 234
+    """
+    # TODO: Implement vocabulary overlap analysis
+    # GitHub Issue #21: https://github.com/craigtrim/pystylometry/issues/21
+    raise NotImplementedError(
+        "Vocabulary overlap not yet implemented. "
+        "See GitHub Issue #21: https://github.com/craigtrim/pystylometry/issues/21"
+    )
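`compute_vocabulary_overlap` is likewise stubbed, so here is a minimal sketch of the three similarity measures named in its Returns section: Jaccard and Dice over word types, cosine over word counts. The regex tokenizer and count-based cosine weighting are assumptions of this sketch; the package may tokenize and weight differently.

```python
import math
import re
from collections import Counter


def _tokens(text: str) -> list[str]:
    return re.findall(r"[a-z']+", text.lower())


def vocabulary_similarities(text1: str, text2: str) -> dict[str, float]:
    """Jaccard and Dice over word types, cosine over raw word counts."""
    v1, v2 = set(_tokens(text1)), set(_tokens(text2))
    shared, union = v1 & v2, v1 | v2
    jaccard = len(shared) / len(union) if union else float("nan")
    dice = 2 * len(shared) / (len(v1) + len(v2)) if union else float("nan")

    c1, c2 = Counter(_tokens(text1)), Counter(_tokens(text2))
    dot = sum(c1[w] * c2[w] for w in shared)  # non-shared words contribute 0
    norm = math.hypot(*c1.values()) * math.hypot(*c2.values())
    cosine = dot / norm if norm else float("nan")
    return {"jaccard": jaccard, "dice": dice, "cosine": cosine}


print(vocabulary_similarities("the cat sat", "the cat ran"))
# jaccard=0.5, dice ~0.67, cosine ~0.67
```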
@@ -1,9 +1,13 @@
 """Syntactic analysis metrics (requires spaCy)."""
 
+from .advanced_syntactic import compute_advanced_syntactic
 from .pos_ratios import compute_pos_ratios
 from .sentence_stats import compute_sentence_stats
+from .sentence_types import compute_sentence_types
 
 __all__ = [
     "compute_pos_ratios",
     "compute_sentence_stats",
+    "compute_advanced_syntactic",
+    "compute_sentence_types",
 ]
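Once 1.0.0 is installed, the two entry points added to `__all__` above are importable directly from the subpackage:

```python
from pystylometry.syntactic import (
    compute_advanced_syntactic,
    compute_sentence_types,
)
```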
@@ -0,0 +1,432 @@
+"""Advanced syntactic analysis using dependency parsing.
+
+This module provides sophisticated syntactic metrics beyond basic POS tagging.
+Using dependency parsing, it extracts features related to sentence complexity,
+grammatical sophistication, and syntactic style preferences.
+
+Related GitHub Issue:
+    #17 - Advanced Syntactic Analysis
+    https://github.com/craigtrim/pystylometry/issues/17
+
+Features implemented:
+    - Parse tree depth (sentence structural complexity)
+    - T-units (minimal terminable units - independent clauses with modifiers)
+    - Clausal density (clauses per T-unit)
+    - Dependent clause ratio
+    - Passive voice ratio
+    - Subordination and coordination indices
+    - Dependency distance metrics
+    - Branching direction (left vs. right)
+
+References:
+    Hunt, K. W. (1965). Grammatical structures written at three grade levels.
+        NCTE Research Report No. 3.
+    Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
+    Lu, X. (2010). Automatic analysis of syntactic complexity in second language
+        writing. International Journal of Corpus Linguistics, 15(4), 474-496.
+    Gibson, E. (2000). The dependency locality theory: A distance-based theory
+        of linguistic complexity. In Image, language, brain (pp. 95-126).
+"""
+
+from .._types import AdvancedSyntacticResult
+from .._utils import check_optional_dependency
+
+
+def compute_advanced_syntactic(
+    text: str,
+    model: str = "en_core_web_sm",
+) -> AdvancedSyntacticResult:
+    """
+    Compute advanced syntactic complexity metrics using dependency parsing.
+
+    This function uses spaCy's dependency parser to extract sophisticated
+    syntactic features that go beyond simple POS tagging. These features
+    capture sentence complexity, grammatical sophistication, and stylistic
+    preferences in syntactic structure.
+
+    Related GitHub Issue:
+        #17 - Advanced Syntactic Analysis
+        https://github.com/craigtrim/pystylometry/issues/17
+
+    Why syntactic complexity matters:
+    1. Correlates with writing proficiency and cognitive development
+    2. Distinguishes between genres (academic vs. conversational)
+    3. Captures authorial style preferences
+    4. Indicates text difficulty and readability
+    5. Varies systematically across languages and registers
+
+    Metrics computed:
+
+    Parse Tree Depth:
+        - Mean and maximum depth of dependency parse trees
+        - Deeper trees = more complex syntactic structures
+        - Indicates level of embedding and subordination
+
+    T-units:
+        - Minimal terminable units (Hunt 1965)
+        - Independent clause + all dependent clauses attached to it
+        - More reliable than sentence length for measuring complexity
+        - Mean T-unit length is standard complexity measure
+
+    Clausal Density:
+        - Number of clauses per T-unit
+        - Higher density = more complex, embedded structures
+        - Academic writing typically has higher clausal density
+
+    Passive Voice:
+        - Ratio of passive constructions to total sentences
+        - Academic/formal writing uses more passive voice
+        - Fiction/conversational writing uses more active voice
+
+    Subordination & Coordination:
+        - Subordination: Use of dependent clauses
+        - Coordination: Use of coordinate clauses (and, but, or)
+        - Balance indicates syntactic style
+
+    Dependency Distance:
+        - Average distance between heads and dependents
+        - Longer distances = more processing difficulty
+        - Related to working memory load
+
+    Branching Direction:
+        - Left-branching: Modifiers before head
+        - Right-branching: Modifiers after head
+        - English tends toward right-branching
+
+    Args:
+        text: Input text to analyze. Should contain multiple sentences for
+            reliable metrics. Very short texts may have unstable values.
+        model: spaCy model name with dependency parser. Default is "en_core_web_sm".
+            Larger models (en_core_web_md, en_core_web_lg) may provide better
+            parsing accuracy but are slower.
+
+    Returns:
+        AdvancedSyntacticResult containing:
+        - mean_parse_tree_depth: Average depth across all parse trees
+        - max_parse_tree_depth: Maximum depth in any parse tree
+        - t_unit_count: Number of T-units detected
+        - mean_t_unit_length: Average words per T-unit
+        - clausal_density: Clauses per T-unit
+        - dependent_clause_ratio: Dependent clauses / total clauses
+        - passive_voice_ratio: Passive sentences / total sentences
+        - subordination_index: Subordinate clauses / total clauses
+        - coordination_index: Coordinate clauses / total clauses
+        - sentence_complexity_score: Composite complexity metric
+        - dependency_distance: Mean distance between heads and dependents
+        - left_branching_ratio: Left-branching structures / total
+        - right_branching_ratio: Right-branching structures / total
+        - metadata: Parse tree details, clause counts, etc.
+
+    Example:
+        >>> result = compute_advanced_syntactic("Complex multi-clause text...")
+        >>> print(f"Parse tree depth: {result.mean_parse_tree_depth:.1f}")
+        Parse tree depth: 5.3
+        >>> print(f"T-units: {result.t_unit_count}")
+        T-units: 12
+        >>> print(f"Clausal density: {result.clausal_density:.2f}")
+        Clausal density: 2.4
+        >>> print(f"Passive voice: {result.passive_voice_ratio * 100:.1f}%")
+        Passive voice: 23.5%
+
+        >>> # Compare genres
+        >>> academic = compute_advanced_syntactic("Academic paper...")
+        >>> fiction = compute_advanced_syntactic("Fiction narrative...")
+        >>> print(f"Academic clausal density: {academic.clausal_density:.2f}")
+        >>> print(f"Fiction clausal density: {fiction.clausal_density:.2f}")
+        >>> # Academic typically higher
+
+    Note:
+        - Requires spaCy with dependency parser (small model minimum)
+        - Parse accuracy affects metrics (larger models are better)
+        - Very long sentences may have parsing errors
+        - Passive voice detection uses dependency patterns
+        - T-unit segmentation follows Hunt (1965) criteria
+        - Empty or very short texts return NaN for ratios
+    """
+    check_optional_dependency("spacy", "syntactic")
+
+    try:
+        import spacy  # type: ignore
+        from spacy.tokens import Doc, Span, Token  # type: ignore
+    except ImportError as e:
+        raise ImportError(
+            "spaCy is required for advanced syntactic analysis. "
+            "Install with: pip install spacy && python -m spacy download en_core_web_sm"
+        ) from e
+
+    # Load spaCy model
+    try:
+        nlp = spacy.load(model)
+    except OSError as e:
+        raise OSError(
+            f"spaCy model '{model}' not found. "
+            f"Download with: python -m spacy download {model}"
+        ) from e
+
+    # Parse text
+    doc = nlp(text)
+    sentences = list(doc.sents)
+
+    # Handle empty text
+    if len(sentences) == 0 or len(doc) == 0:
+        return AdvancedSyntacticResult(
+            mean_parse_tree_depth=float("nan"),
+            max_parse_tree_depth=0,
+            t_unit_count=0,
+            mean_t_unit_length=float("nan"),
+            clausal_density=float("nan"),
+            dependent_clause_ratio=float("nan"),
+            passive_voice_ratio=float("nan"),
+            subordination_index=float("nan"),
+            coordination_index=float("nan"),
+            sentence_complexity_score=float("nan"),
+            dependency_distance=float("nan"),
+            left_branching_ratio=float("nan"),
+            right_branching_ratio=float("nan"),
+            metadata={
+                "sentence_count": 0,
+                "word_count": 0,
+                "total_clauses": 0,
+                "warning": "Empty text or no sentences found",
+            },
+        )
+
+    # 1. Calculate parse tree depth
+    parse_depths = []
+    for sent in sentences:
+        depth = _calculate_max_tree_depth(sent.root)
+        parse_depths.append(depth)
+
+    mean_parse_tree_depth = sum(parse_depths) / len(parse_depths)
+    max_parse_tree_depth = max(parse_depths)
+
+    # 2. Calculate mean dependency distance
+    dependency_distances = []
+    for token in doc:
+        if token != token.head:  # Exclude root
+            distance = abs(token.i - token.head.i)
+            dependency_distances.append(distance)
+
+    if dependency_distances:
+        mean_dependency_distance = sum(dependency_distances) / len(dependency_distances)
+    else:
+        mean_dependency_distance = 0.0
+
+    # 3. Identify T-units and calculate mean T-unit length
+    t_units = _identify_t_units(doc)
+    t_unit_count = len(t_units)
+    t_unit_lengths = [len(t_unit) for t_unit in t_units]
+
+    if t_unit_count > 0:
+        mean_t_unit_length = sum(t_unit_lengths) / t_unit_count
+    else:
+        mean_t_unit_length = float("nan")
+
+    # 4. Count clauses (total, dependent, subordinate, coordinate)
+    total_clauses = 0
+    dependent_clause_count = 0
+    subordinate_clause_count = 0
+    coordinate_clause_count = 0
+
+    for sent in sentences:
+        sent_total, sent_dependent, sent_subordinate, sent_coordinate = _count_clauses(
+            sent
+        )
+        total_clauses += sent_total
+        dependent_clause_count += sent_dependent
+        subordinate_clause_count += sent_subordinate
+        coordinate_clause_count += sent_coordinate
+
+    # Calculate ratios
+    if total_clauses > 0:
+        dependent_clause_ratio = dependent_clause_count / total_clauses
+        subordination_index = subordinate_clause_count / total_clauses
+        coordination_index = coordinate_clause_count / total_clauses
+    else:
+        dependent_clause_ratio = float("nan")
+        subordination_index = float("nan")
+        coordination_index = float("nan")
+
+    if t_unit_count > 0:
+        clausal_density = total_clauses / t_unit_count
+    else:
+        clausal_density = float("nan")
+
+    # 5. Detect passive voice
+    passive_sentence_count = sum(1 for sent in sentences if _is_passive_voice(sent))
+    passive_voice_ratio = passive_sentence_count / len(sentences)
+
+    # 6. Calculate branching direction
+    left_branching = 0
+    right_branching = 0
+
+    for token in doc:
+        if token != token.head:  # Exclude root
+            if token.i < token.head.i:
+                left_branching += 1
+            else:
+                right_branching += 1
+
+    total_branching = left_branching + right_branching
+    if total_branching > 0:
+        left_branching_ratio = left_branching / total_branching
+        right_branching_ratio = right_branching / total_branching
+    else:
+        left_branching_ratio = float("nan")
+        right_branching_ratio = float("nan")
+
+    # 7. Calculate sentence complexity score (composite metric)
+    # Normalize individual metrics to 0-1 range
+    normalized_parse_depth = min(mean_parse_tree_depth / 10, 1.0)
+    normalized_clausal_density = (
+        min(clausal_density / 3, 1.0) if not isinstance(clausal_density, float) or not (clausal_density != clausal_density) else 0.0
+    )
+    normalized_t_unit_length = (
+        min(mean_t_unit_length / 25, 1.0) if not isinstance(mean_t_unit_length, float) or not (mean_t_unit_length != mean_t_unit_length) else 0.0
+    )
+    normalized_dependency_distance = min(mean_dependency_distance / 5, 1.0)
+    normalized_subordination = (
+        subordination_index if not isinstance(subordination_index, float) or not (subordination_index != subordination_index) else 0.0
+    )
+
+    # Weighted combination
+    sentence_complexity_score = (
+        0.3 * normalized_parse_depth
+        + 0.3 * normalized_clausal_density
+        + 0.2 * normalized_t_unit_length
+        + 0.1 * normalized_subordination
+        + 0.1 * normalized_dependency_distance
+    )
+
+    # Collect metadata
+    metadata = {
+        "sentence_count": len(sentences),
+        "word_count": len(doc),
+        "total_clauses": total_clauses,
+        "independent_clause_count": total_clauses - dependent_clause_count,
+        "dependent_clause_count": dependent_clause_count,
+        "subordinate_clause_count": subordinate_clause_count,
+        "coordinate_clause_count": coordinate_clause_count,
+        "passive_sentence_count": passive_sentence_count,
+        "parse_depths_per_sentence": parse_depths,
+        "t_unit_lengths": t_unit_lengths,
+        "t_unit_count": t_unit_count,
+        "dependency_distances": dependency_distances[:100],  # Sample for brevity
+        "left_branching_count": left_branching,
+        "right_branching_count": right_branching,
+        "model_used": model,
+    }
+
+    return AdvancedSyntacticResult(
+        mean_parse_tree_depth=mean_parse_tree_depth,
+        max_parse_tree_depth=max_parse_tree_depth,
+        t_unit_count=t_unit_count,
+        mean_t_unit_length=mean_t_unit_length,
+        clausal_density=clausal_density,
+        dependent_clause_ratio=dependent_clause_ratio,
+        passive_voice_ratio=passive_voice_ratio,
+        subordination_index=subordination_index,
+        coordination_index=coordination_index,
+        sentence_complexity_score=sentence_complexity_score,
+        dependency_distance=mean_dependency_distance,
+        left_branching_ratio=left_branching_ratio,
+        right_branching_ratio=right_branching_ratio,
+        metadata=metadata,
+    )
+
+
+def _calculate_max_tree_depth(token) -> int:
+    """
+    Calculate maximum depth of dependency tree starting from token.
+
+    Args:
+        token: spaCy Token to start from (typically sentence root)
+
+    Returns:
+        Maximum depth of tree (root = 0, children = parent + 1)
+    """
+    if not list(token.children):
+        return 0
+
+    child_depths = [_calculate_max_tree_depth(child) for child in token.children]
+    return max(child_depths) + 1
+
+
+def _identify_t_units(doc) -> list:
+    """
+    Identify T-units (minimal terminable units) in document.
+
+    A T-unit is one main clause plus all subordinate clauses attached to it.
+    This follows Hunt (1965) definition.
+
+    Args:
+        doc: spaCy Doc object
+
+    Returns:
+        List of spaCy Span objects, each representing a T-unit
+    """
+    # For simplicity, treat each sentence as a T-unit
+    # More sophisticated approach would split compound sentences
+    # into separate T-units, but this requires complex coordination analysis
+    return list(doc.sents)
+
+
+def _count_clauses(sent) -> tuple[int, int, int, int]:
+    """
+    Count different types of clauses in sentence.
+
+    Args:
+        sent: spaCy Span representing a sentence
+
+    Returns:
+        Tuple of (total_clauses, dependent_clauses, subordinate_clauses, coordinate_clauses)
+    """
+    # Start with 1 for the main clause
+    total = 1
+    dependent = 0
+    subordinate = 0
+    coordinate = 0
+
+    # Dependency labels that indicate clauses
+    dependent_clause_labels = {"csubj", "ccomp", "xcomp", "advcl", "acl", "relcl"}
+    subordinate_clause_labels = {"advcl", "acl", "relcl"}
+    coordinate_clause_labels = {"conj"}
+
+    for token in sent:
+        if token.dep_ in dependent_clause_labels:
+            total += 1
+            dependent += 1
+            if token.dep_ in subordinate_clause_labels:
+                subordinate += 1
+        elif token.dep_ in coordinate_clause_labels and token.pos_ == "VERB":
+            # Coordinate clause (conj) with verb = coordinated main clause
+            total += 1
+            coordinate += 1
+
+    return total, dependent, subordinate, coordinate
+
+
+def _is_passive_voice(sent) -> bool:
+    """
+    Detect if sentence contains passive voice construction.
+
+    Args:
+        sent: spaCy Span representing a sentence
+
+    Returns:
+        True if passive voice detected, False otherwise
+    """
+    # Look for passive auxiliary + past participle pattern
+    for token in sent:
+        # Check for passive subject dependency (older spaCy versions)
+        if token.dep_ == "nsubjpass":
+            return True
+        # Check for passive auxiliary + past participle (newer spaCy versions)
+        # In newer spaCy, passive is marked with nsubj:pass or through aux:pass
+        if "pass" in token.dep_:
+            return True
+        # Alternative: check for "be" verb + past participle
+        if token.dep_ == "auxpass":
+            return True
+
+    return False