bib2graph 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bib2graph/__init__.py +20 -0
- bib2graph/analisis_red.py +645 -0
- bib2graph/consigue_los_articulos.py +248 -0
- bib2graph/enriquecimiento.py +300 -0
- bib2graph/models.py +73 -0
- bib2graph/queries.py +7 -0
- bib2graph-0.1.0.dist-info/LICENSE +190 -0
- bib2graph-0.1.0.dist-info/METADATA +206 -0
- bib2graph-0.1.0.dist-info/RECORD +11 -0
- bib2graph-0.1.0.dist-info/WHEEL +4 -0
- bib2graph-0.1.0.dist-info/entry_points.txt +3 -0
bib2graph/__init__.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""
|
|
2
|
+
bib2graph - A Python package for bibliometric data processing and network analysis.
|
|
3
|
+
|
|
4
|
+
This package provides tools for:
|
|
5
|
+
1. Ingesting bibliographic data from various sources
|
|
6
|
+
2. Enriching data with additional metadata from external APIs
|
|
7
|
+
3. Extracting and analyzing bibliometric networks
|
|
8
|
+
|
|
9
|
+
Main components:
|
|
10
|
+
- BibliometricDataLoader: For loading and normalizing bibliographic data
|
|
11
|
+
- BibliometricDataEnricher: For enriching data with additional metadata
|
|
12
|
+
- BibliometricNetworkAnalyzer: For extracting and analyzing networks
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from bib2graph.consigue_los_articulos import BibliometricDataLoader
|
|
16
|
+
from bib2graph.enriquecimiento import BibliometricDataEnricher
|
|
17
|
+
from bib2graph.analisis_red import BibliometricNetworkAnalyzer
|
|
18
|
+
from bib2graph.models import *
|
|
19
|
+
|
|
20
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,645 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Network analysis module for bibliometric analysis of semiconductor supply chain.
|
|
3
|
+
|
|
4
|
+
This module provides functionality for extracting co-citation networks from the Neo4j database
|
|
5
|
+
and exporting them for analysis in external tools.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
import networkx as nx
|
|
10
|
+
from neomodel import db, config
|
|
11
|
+
from typing import Dict, Any, Tuple, Set
|
|
12
|
+
|
|
13
|
+
# Neo4j connection parameters.
# Each value can be overridden through an environment variable of the same
# name, so real credentials never have to be edited into the source. The
# fallbacks are the original development defaults.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "password")

# Configure the default neomodel connection at import time.
# The URL is derived from NEO4J_URI (previously the host was hard-coded to
# localhost:7687 and ignored NEO4J_URI). A URI without a scheme is treated as
# a bolt:// address.
_NEO4J_SCHEME, _, _NEO4J_HOST = NEO4J_URI.rpartition("://")
config.DATABASE_URL = (
    f"{_NEO4J_SCHEME or 'bolt'}://{NEO4J_USER}:{NEO4J_PASSWORD}@{_NEO4J_HOST}"
)
|
|
20
|
+
|
|
21
|
+
class BibliometricNetworkAnalyzer:
|
|
22
|
+
"""Class for analyzing bibliometric networks from Neo4j database."""
|
|
23
|
+
|
|
24
|
+
def __init__(self, uri: str = NEO4J_URI, user: str = NEO4J_USER, password: str = NEO4J_PASSWORD):
    """Initialize the network analyzer with Neo4j connection parameters.

    The parameters are combined into a single URL and assigned to
    ``neomodel.config.DATABASE_URL``; no state is stored on the instance.

    Args:
        uri: Neo4j connection URI, e.g. ``bolt://localhost:7687``. A URI
            without a scheme is treated as a ``bolt://`` address.
        user: Neo4j username
        password: Neo4j password
    """
    # Keep whatever scheme the caller supplied. The previous implementation
    # only stripped a literal "bolt://" prefix, so a URI such as
    # "neo4j://host:7687" produced the invalid "bolt://user:pw@neo4j://host:7687".
    scheme, _, host = uri.rpartition("://")
    config.DATABASE_URL = f"{scheme or 'bolt'}://{user}:{password}@{host}"
|
|
34
|
+
|
|
35
|
+
def create_co_citation_relationships(self) -> int:
    """Create CO_CITED_WITH relationships in Neo4j based on shared references.

    Two seed papers (``is_seed: True``) are linked when both have a
    REFERENCES relationship to the same paper. The relationship weight is
    the number of distinct shared references.

    Returns:
        Number of distinct CO_CITED_WITH relationships created or refreshed
        by this call.
    """
    # Every pair is matched twice, as (p1, p2) and as (p2, p1). The
    # undirected MERGE resolves both rows to the same relationship, so the
    # result must be counted with DISTINCT; a plain COUNT(r) reported double
    # the real number.
    # ON MATCH keeps the weight current when the method is re-run after more
    # references have been loaded (previously only newly created
    # relationships ever received a weight).
    cypher_query = """
        MATCH (p1:Paper {is_seed: True})-[:REFERENCES]->(ref:Paper)<-[:REFERENCES]-(p2:Paper {is_seed: True})
        WHERE p1 <> p2
        WITH p1, p2, COUNT(DISTINCT ref) AS shared_refs
        MERGE (p1)-[r:CO_CITED_WITH]-(p2)
        ON CREATE SET r.weight = shared_refs
        ON MATCH SET r.weight = shared_refs
        RETURN COUNT(DISTINCT r) AS relationship_count
    """

    results, _ = db.cypher_query(cypher_query)
    return results[0][0] if results else 0
|
|
53
|
+
|
|
54
|
+
def generate_quality_report(self, dois_set: Set[str]) -> Dict[str, Any]:
    """Generate a quality report for the co-citation network.

    Runs a series of Cypher aggregations restricted to the given DOIs and
    compares the results with fixed thresholds.

    Args:
        dois_set: Set of DOIs involved in the co-citation network. When it
            is empty no query is issued and every metric is reported as
            zero / ``"No data"``.

    Returns:
        Dictionary containing quality metrics: ``document_count``,
        ``doi_ref_percentage``, ``temporal_coverage`` (plus ``min_year``,
        ``max_year`` and ``unique_years`` when year data exists),
        ``country_count``, ``recurring_authors``, ``top_authors``,
        ``source_duplication_percentage``, ``missing_data_percentages``,
        one ``meets_*_threshold`` flag per scored criterion, and the
        aggregate ``criteria_met_count``, ``criteria_total_count`` and
        ``quality_score``.
    """
    report: Dict[str, Any] = {}
    # Built once and shared by every query below.
    params = {"dois": list(dois_set)}

    def first_row(query: str):
        """Run ``query`` with the DOI parameter; return its first row or None."""
        rows, _ = db.cypher_query(query, params)
        return rows[0] if rows else None

    # NOTE: AUTHORED is matched without a direction in the queries below.
    # The collaboration extractors in this class traverse it as
    # (Author)-[:AUTHORED]->(Paper) while this method used
    # (Paper)-[:AUTHORED]->(Author); an undirected match returns the same
    # rows whichever direction is actually stored.

    # 1. Document volume check
    report["document_count"] = len(dois_set)
    report["meets_volume_threshold"] = report["document_count"] >= 200

    # 2. DOI and references percentage
    doi_ref_percentage = 0
    if dois_set:
        row = first_row("""
            MATCH (p:Paper)
            WHERE p.doi IN $dois
            RETURN COUNT(p) AS total,
                   SUM(
                       CASE
                           WHEN p.doi IS NOT NULL AND EXISTS { (p)-[:REFERENCES]->() }
                           THEN 1
                           ELSE 0
                       END
                   ) AS with_doi_and_refs
        """)
        total = row[0] if row and row[0] is not None else 0
        with_doi_and_refs = row[1] if row and row[1] is not None else 0
        if total > 0:
            doi_ref_percentage = (with_doi_and_refs / total) * 100
    report["doi_ref_percentage"] = doi_ref_percentage
    report["meets_doi_ref_threshold"] = doi_ref_percentage >= 90

    # 3. Temporal coverage
    row = None
    if dois_set:
        row = first_row("""
            MATCH (p:Paper)
            WHERE p.doi IN $dois AND p.year IS NOT NULL
            RETURN MIN(toInteger(p.year)) AS min_year,
                   MAX(toInteger(p.year)) AS max_year,
                   COUNT(DISTINCT p.year) AS unique_years
        """)
    if row and row[0] is not None:
        report["min_year"] = row[0]
        report["max_year"] = row[1]
        report["unique_years"] = row[2]
        report["temporal_coverage"] = f"{report['min_year']}–{report['max_year']}"
        # Check if coverage includes 2000-2024
        report["meets_temporal_threshold"] = (
            report["min_year"] <= 2000 and report["max_year"] >= 2024
        )
    else:
        report["temporal_coverage"] = "No data"
        report["meets_temporal_threshold"] = False

    # 4. Geographic diversity
    # NOTE(review): this counts distinct Institution.address values, which is
    # only a country count if `address` holds a country - confirm against the
    # Institution model.
    row = None
    if dois_set:
        row = first_row("""
            MATCH (p:Paper)-[:AUTHORED]-(a:Author)-[:AFFILIATED_WITH]->(i:Institution)
            WHERE p.doi IN $dois
            RETURN COUNT(DISTINCT i.address) AS country_count
        """)
    report["country_count"] = row[0] if row and row[0] is not None else 0
    report["meets_geographic_threshold"] = report["country_count"] >= 5

    # 5. Key author participation (authors appearing on more than one paper)
    row = None
    if dois_set:
        row = first_row("""
            MATCH (p:Paper)-[:AUTHORED]-(a:Author)
            WHERE p.doi IN $dois
            WITH a, COUNT(DISTINCT p) AS paper_count
            WHERE paper_count > 1
            RETURN COUNT(a) AS recurring_authors
        """)
    report["recurring_authors"] = row[0] if row and row[0] is not None else 0
    report["meets_author_threshold"] = report["recurring_authors"] >= 10

    # Top ten authors by paper count, for display in the report
    top_authors = []
    if dois_set:
        rows, _ = db.cypher_query(
            """
            MATCH (p:Paper)-[:AUTHORED]-(a:Author)
            WHERE p.doi IN $dois
            WITH a, COUNT(DISTINCT p) AS paper_count
            ORDER BY paper_count DESC
            LIMIT 10
            RETURN a.name AS author_name, paper_count
            """,
            params,
        )
        # Columns come back in RETURN order: author_name, paper_count.
        top_authors = [
            {"name": author_name, "paper_count": paper_count}
            for author_name, paper_count in rows
        ]
    report["top_authors"] = top_authors

    # 6. Source duplication level (share of papers whose source value
    #    repeats one already seen)
    source_duplication_percentage = 0
    if dois_set:
        row = first_row("""
            MATCH (p:Paper)
            WHERE p.doi IN $dois
            RETURN COUNT(p) AS total,
                   COUNT(DISTINCT p.source) AS unique_sources
        """)
        total = row[0] if row and row[0] is not None else 0
        unique_sources = row[1] if row and row[1] is not None else 0
        if total > 0:
            source_duplication_percentage = ((total - unique_sources) / total) * 100
    report["source_duplication_percentage"] = source_duplication_percentage

    # 7. Missing data quality
    # Field order matches the column order of the RETURN clause after `total`.
    fields = ("title", "year", "abstract", "authors", "keywords")
    missing_data = dict.fromkeys(fields, 0)
    if dois_set:
        row = first_row("""
            MATCH (p:Paper)
            WHERE p.doi IN $dois
            RETURN
                COUNT(p) AS total,
                SUM(CASE WHEN p.title IS NULL THEN 1 ELSE 0 END) AS missing_title,
                SUM(CASE WHEN p.year IS NULL THEN 1 ELSE 0 END) AS missing_year,
                SUM(CASE WHEN p.abstract IS NULL THEN 1 ELSE 0 END) AS missing_abstract,
                SUM(CASE WHEN NOT EXISTS((p)-[:AUTHORED]-()) THEN 1 ELSE 0 END) AS missing_authors,
                SUM(CASE WHEN NOT EXISTS((p)-[:HAS_KEYWORD]->()) THEN 1 ELSE 0 END) AS missing_keywords
        """)
        if row:
            total = row[0] if row[0] is not None else 0
            for field, missing_count in zip(fields, row[1:]):
                if total > 0 and missing_count is not None:
                    missing_data[field] = (missing_count / total) * 100
    report["missing_data_percentages"] = missing_data

    # Overall quality assessment: share of the five threshold checks passed
    criteria_met = [
        report["meets_volume_threshold"],
        report["meets_doi_ref_threshold"],
        report["meets_temporal_threshold"],
        report["meets_geographic_threshold"],
        report["meets_author_threshold"],
    ]

    report["criteria_met_count"] = sum(1 for c in criteria_met if c)
    report["criteria_total_count"] = len(criteria_met)
    report["quality_score"] = (
        (report["criteria_met_count"] / report["criteria_total_count"]) * 100
        if report["criteria_total_count"] > 0
        else 0
    )

    return report
|
|
251
|
+
|
|
252
|
+
def extract_co_citation_network(self, min_weight: int = 1) -> Tuple[nx.Graph, Dict[str, Any]]:
    """Extract the co-citation network from Neo4j.

    Only papers that take part in at least one CO_CITED_WITH relationship
    of sufficient weight become nodes.

    Args:
        min_weight: Minimum weight for co-citation relationships

    Returns:
        Tuple containing:
        - NetworkX graph representing the co-citation network
        - Dictionary containing quality report metrics
    """
    graph = nx.Graph()

    # Step 1: fetch the qualifying relationships (the edges).
    edge_rows, edge_meta = db.cypher_query(
        """
        MATCH (p1:Paper)-[r:CO_CITED_WITH]-(p2:Paper)
        WHERE r.weight >= $min_weight
        RETURN p1.doi AS source, p2.doi AS target, r.weight AS weight
        """,
        {"min_weight": min_weight},
    )
    edge_fields = list(edge_meta)

    # DOIs that appear on either end of a kept edge.
    linked_dois = set()
    weighted_edges = []
    for values in edge_rows:
        rec = dict(zip(edge_fields, values))
        src, dst, w = rec['source'], rec['target'], rec['weight']
        # Skip rows with a missing endpoint DOI or a missing weight.
        if not src or not dst or w is None:
            continue
        weighted_edges.append((src, dst, w))
        linked_dois.add(src)
        linked_dois.add(dst)

    if linked_dois:
        # Step 2: fetch display attributes for the papers involved.
        paper_rows, paper_meta = db.cypher_query(
            """
            MATCH (p:Paper)
            WHERE p.doi IN $dois
            RETURN p.doi AS doi, p.title AS title, p.year AS year
            """,
            {"dois": list(linked_dois)},
        )
        paper_fields = list(paper_meta)
        for values in paper_rows:
            rec = dict(zip(paper_fields, values))
            # GraphML cannot store None, so substitute neutral placeholders.
            node_title = "" if rec['title'] is None else rec['title']
            node_year = -1 if rec['year'] is None else rec['year']
            graph.add_node(rec['doi'], title=node_title, year=node_year)

        # Step 3: add the edges collected in step 1.
        for src, dst, w in weighted_edges:
            graph.add_edge(src, dst, weight=w)

    # Quality metrics are computed over the same DOI set.
    return graph, self.generate_quality_report(linked_dois)
|
|
307
|
+
|
|
308
|
+
def extract_author_collaboration_network(self) -> nx.Graph:
    """Extract author collaboration network from Neo4j.

    Every Author becomes a node keyed by name, with an ``orcid`` attribute.
    Two authors are connected when they share at least one paper; the edge
    ``weight`` is the number of distinct shared papers.

    Returns:
        NetworkX graph representing the author collaboration network
    """
    G = nx.Graph()

    author_query = """
        MATCH (a:Author)
        RETURN a.name AS name, a.orcid AS orcid
    """

    # AUTHORED is matched without a direction: generate_quality_report in
    # this class traverses it the other way round, and an undirected match
    # returns the same pairs whichever direction is stored. DISTINCT guards
    # the count against a paper being reached more than once.
    collaboration_query = """
        MATCH (a1:Author)-[:AUTHORED]-(p:Paper)-[:AUTHORED]-(a2:Author)
        WHERE a1 <> a2
        WITH a1, a2, COUNT(DISTINCT p) AS collaboration_count
        RETURN a1.name AS source, a2.name AS target, collaboration_count AS weight
    """

    # Add nodes (authors)
    results, meta = db.cypher_query(author_query)
    columns = list(meta)
    for row in results:
        record = dict(zip(columns, row))
        orcid = record['orcid']
        # GraphML cannot store None, so a missing ORCID is stored as "";
        # this keeps the graph exportable with export_graph_to_graphml.
        G.add_node(record['name'], orcid="" if orcid is None else orcid)

    # Add edges (collaboration relationships). Each pair is returned in both
    # orders; the second add_edge rewrites the same undirected edge with the
    # same weight.
    results, meta = db.cypher_query(collaboration_query)
    columns = list(meta)
    for row in results:
        record = dict(zip(columns, row))
        G.add_edge(record['source'], record['target'], weight=record['weight'])

    return G
|
|
355
|
+
|
|
356
|
+
def extract_institution_collaboration_network(self) -> nx.Graph:
    """Extract institution collaboration network from Neo4j.

    Every Institution becomes a node keyed by name. Two institutions are
    connected when authors affiliated with each of them share a paper; the
    edge ``weight`` is the number of distinct shared papers.

    Returns:
        NetworkX graph representing the institution collaboration network
    """
    G = nx.Graph()

    institution_query = """
        MATCH (i:Institution)
        RETURN i.name AS name
    """

    # AUTHORED is matched without a direction: generate_quality_report in
    # this class traverses it the other way round, and an undirected match
    # returns the same pairs whichever direction is stored.
    collaboration_query = """
        MATCH (i1:Institution)<-[:AFFILIATED_WITH]-(a1:Author)-[:AUTHORED]-(p:Paper)-[:AUTHORED]-(a2:Author)-[:AFFILIATED_WITH]->(i2:Institution)
        WHERE i1 <> i2
        WITH i1, i2, COUNT(DISTINCT p) AS collaboration_count
        RETURN i1.name AS source, i2.name AS target, collaboration_count AS weight
    """

    # Add nodes (institutions)
    results, meta = db.cypher_query(institution_query)
    columns = list(meta)
    for row in results:
        record = dict(zip(columns, row))
        G.add_node(record['name'])

    # Add edges (collaboration relationships). Each pair is returned in both
    # orders; the second add_edge rewrites the same undirected edge with the
    # same weight.
    results, meta = db.cypher_query(collaboration_query)
    columns = list(meta)
    for row in results:
        record = dict(zip(columns, row))
        G.add_edge(record['source'], record['target'], weight=record['weight'])

    return G
|
|
400
|
+
|
|
401
|
+
def extract_keyword_co_occurrence_network(self) -> nx.Graph:
    """Build the keyword co-occurrence graph from Neo4j.

    Every Keyword becomes a node keyed by name. Two keywords are linked
    when at least one paper has both; the edge ``weight`` is the number of
    matched papers.

    Returns:
        NetworkX graph representing the keyword co-occurrence network
    """
    nodes_cypher = """
        MATCH (k:Keyword)
        RETURN k.name AS name
    """
    edges_cypher = """
        MATCH (k1:Keyword)<-[:HAS_KEYWORD]-(p:Paper)-[:HAS_KEYWORD]->(k2:Keyword)
        WHERE k1 <> k2
        WITH k1, k2, COUNT(p) AS cooccurrence_count
        RETURN k1.name AS source, k2.name AS target, cooccurrence_count AS weight
    """

    network = nx.Graph()

    # Nodes: one per keyword.
    keyword_rows, keyword_meta = db.cypher_query(nodes_cypher)
    keyword_fields = list(keyword_meta)
    for values in keyword_rows:
        entry = dict(zip(keyword_fields, values))
        network.add_node(entry['name'])

    # Edges: one per co-occurring keyword pair, weighted by paper count.
    pair_rows, pair_meta = db.cypher_query(edges_cypher)
    pair_fields = list(pair_meta)
    for values in pair_rows:
        entry = dict(zip(pair_fields, values))
        network.add_edge(entry['source'], entry['target'], weight=entry['weight'])

    return network
|
|
445
|
+
|
|
446
|
+
def export_graph_to_graphml(self, G: nx.Graph, filepath: str) -> None:
    """Export a NetworkX graph to GraphML format.

    GraphML cannot store ``None``, and the writer raises when it meets such
    a value. Attributes whose value is ``None`` are therefore left out of
    the written file. The graph passed in is not modified.

    Args:
        G: NetworkX graph to export
        filepath: Path to save the GraphML file
    """
    # G.copy() gives every node/edge its own attribute dict, so the pruning
    # below never touches the caller's graph.
    exportable = G.copy()

    attr_dicts = [exportable.graph]
    attr_dicts.extend(attrs for _, attrs in exportable.nodes(data=True))
    attr_dicts.extend(attrs for _, _, attrs in exportable.edges(data=True))

    for attrs in attr_dicts:
        # Collect the keys first: a dict must not change size while iterated.
        for key in [k for k, value in attrs.items() if value is None]:
            del attrs[key]

    nx.write_graphml(exportable, filepath)
|
|
454
|
+
|
|
455
|
+
def export_graph_to_csv(self, G: nx.Graph, nodes_filepath: str, edges_filepath: str) -> None:
    """Export a NetworkX graph to CSV format (nodes and edges files).

    The nodes file has an ``id`` column followed by one column per node
    attribute; the edges file has ``source`` and ``target`` columns followed
    by one column per edge attribute. Attribute columns are sorted by name.
    An attribute missing on a given node/edge is written as an empty field.

    Rows are produced by the ``csv`` module, so values containing quotes,
    commas or newlines are escaped correctly (the previous hand-built rows
    were corrupted by a ``"`` inside a title, for example). ``None`` values
    are written as empty fields.

    Args:
        G: NetworkX graph to export
        nodes_filepath: Path to save the nodes CSV file
        edges_filepath: Path to save the edges CSV file
    """
    import csv  # local import: the module is only needed by this method

    def write_table(path, key_columns, records):
        """Write ``records`` (tuples ending in an attribute dict) to ``path``."""
        records = list(records)
        # Union of attribute names over all records, in a stable order.
        attr_names = sorted({name for *_, attrs in records for name in attrs})

        # newline='' hands line-ending control to the csv module.
        with open(path, 'w', encoding='utf-8', newline='') as f:
            # Header keeps its original unquoted form; data fields are all
            # quoted, as before.
            csv.writer(f, lineterminator='\n').writerow([*key_columns, *attr_names])
            row_writer = csv.writer(f, quoting=csv.QUOTE_ALL, lineterminator='\n')
            for *keys, attrs in records:
                row_writer.writerow(
                    [*keys, *(attrs.get(name, "") for name in attr_names)]
                )

    write_table(nodes_filepath, ["id"], G.nodes(data=True))
    write_table(edges_filepath, ["source", "target"], G.edges(data=True))
|
|
496
|
+
|
|
497
|
+
def calculate_network_metrics(self, G: nx.Graph) -> Dict[str, Any]:
    """Calculate various network metrics for a graph.

    Centrality and clustering figures are computed on the largest connected
    component only. For a graph with no nodes every metric is reported as 0
    instead of raising.

    Args:
        G: NetworkX graph to analyze

    Returns:
        Dictionary of network metrics: ``node_count``, ``edge_count``,
        ``density``, ``connected_components``, ``largest_component_size``,
        ``max_degree_centrality``, ``avg_degree_centrality``,
        ``avg_clustering_coefficient`` and, only when the largest component
        has fewer than 1000 nodes, ``max_betweenness_centrality`` and
        ``avg_betweenness_centrality``.
    """
    metrics: Dict[str, Any] = {}

    # Basic metrics
    metrics['node_count'] = G.number_of_nodes()
    metrics['edge_count'] = G.number_of_edges()
    metrics['density'] = nx.density(G)

    # Connected components, computed once. A connected graph yields exactly
    # one component containing every node, so no separate is_connected()
    # call is needed (that call raises on an empty graph).
    components = list(nx.connected_components(G))
    metrics['connected_components'] = len(components)

    if not components:
        # Empty graph: there is no largest component to measure.
        metrics['largest_component_size'] = 0
        metrics['max_degree_centrality'] = 0
        metrics['avg_degree_centrality'] = 0
        metrics['max_betweenness_centrality'] = 0
        metrics['avg_betweenness_centrality'] = 0
        metrics['avg_clustering_coefficient'] = 0
        return metrics

    largest_cc = max(components, key=len)
    metrics['largest_component_size'] = len(largest_cc)

    # Centrality measures are restricted to the largest component.
    largest_subgraph = G.subgraph(largest_cc).copy()

    # Degree centrality
    degree_centrality = nx.degree_centrality(largest_subgraph)
    metrics['max_degree_centrality'] = max(degree_centrality.values()) if degree_centrality else 0
    metrics['avg_degree_centrality'] = (
        sum(degree_centrality.values()) / len(degree_centrality) if degree_centrality else 0
    )

    # Betweenness centrality can be slow, so it is skipped for large networks.
    if largest_subgraph.number_of_nodes() < 1000:
        betweenness_centrality = nx.betweenness_centrality(largest_subgraph)
        metrics['max_betweenness_centrality'] = (
            max(betweenness_centrality.values()) if betweenness_centrality else 0
        )
        metrics['avg_betweenness_centrality'] = (
            sum(betweenness_centrality.values()) / len(betweenness_centrality)
            if betweenness_centrality
            else 0
        )

    # Clustering coefficient
    metrics['avg_clustering_coefficient'] = nx.average_clustering(largest_subgraph)

    return metrics
|
|
541
|
+
|
|
542
|
+
def detect_communities(self, G: nx.Graph, algorithm: str = 'louvain') -> Tuple[Dict[Any, int], float]:
    """Detect communities in a graph using various algorithms.

    Args:
        G: NetworkX graph to analyze
        algorithm: Community detection algorithm to use ('louvain',
            'label_propagation', 'greedy_modularity'). 'louvain' needs the
            optional python-louvain package and falls back to
            'greedy_modularity' when it cannot be imported. Any other name
            places every node in community 0 with modularity 0.0.

    Returns:
        Tuple of (community assignments, modularity score). The assignments
        map each node to an integer community id.
    """
    if G.number_of_edges() == 0:
        # Modularity is undefined without edges and the library functions
        # raise; report every node as its own community instead.
        return {node: idx for idx, node in enumerate(G.nodes())}, 0.0

    if algorithm == 'louvain':
        try:
            import community as community_louvain
        except ImportError:
            print("python-louvain package not installed. Falling back to greedy modularity.")
            algorithm = 'greedy_modularity'
        else:
            partition = community_louvain.best_partition(G)
            modularity = community_louvain.modularity(partition, G)
            return partition, modularity

    # networkx is already imported at module level, so this import cannot
    # fail on its own; the former ImportError fallbacks were unreachable.
    from networkx.algorithms import community as nx_community

    finders = {
        'label_propagation': nx_community.label_propagation_communities,
        'greedy_modularity': nx_community.greedy_modularity_communities,
    }
    finder = finders.get(algorithm)
    if finder is None:
        # Default fallback for unrecognized algorithm names
        return {node: 0 for node in G.nodes()}, 0.0

    # Materialize the result: it may be a one-shot iterator, and it is
    # consumed twice (once for the partition, once for the modularity).
    communities = [set(members) for members in finder(G)]

    # Convert to dictionary format
    partition = {
        node: idx
        for idx, members in enumerate(communities)
        for node in members
    }
    modularity = nx_community.modularity(G, communities)
    return partition, modularity
|
|
596
|
+
|
|
597
|
+
# Example usage: build the co-citation network, report on it and export it.
if __name__ == "__main__":
    network_analyzer = BibliometricNetworkAnalyzer()

    # Step 1: materialize CO_CITED_WITH relationships in Neo4j.
    created = network_analyzer.create_co_citation_relationships()
    print(f"Created {created} CO_CITED_WITH relationships")

    # Step 2: pull the network and its quality report.
    network, report = network_analyzer.extract_co_citation_network(min_weight=1)
    node_total = network.number_of_nodes()
    edge_total = network.number_of_edges()
    print(f"Co-citation network has {node_total} nodes and {edge_total} edges")

    # Step 3: print the quality report, one criterion per line.
    print("\nQuality Report for Co-citation Network:")
    print(f" Document count: {report['document_count']} (Threshold: ≥200, Met: {report['meets_volume_threshold']})")
    print(f" DOI and references: {report['doi_ref_percentage']:.2f}% (Threshold: ≥90%, Met: {report['meets_doi_ref_threshold']})")
    print(f" Temporal coverage: {report['temporal_coverage']} (Threshold: 2000-2024, Met: {report['meets_temporal_threshold']})")
    print(f" Geographic diversity: {report['country_count']} countries (Threshold: ≥5, Met: {report['meets_geographic_threshold']})")
    print(f" Key authors: {report['recurring_authors']} recurring authors (Threshold: ≥10, Met: {report['meets_author_threshold']})")
    print(f" Source duplication: {report['source_duplication_percentage']:.2f}%")

    # Share of papers lacking each field.
    print(" Missing data percentages:")
    for field_name, pct in report['missing_data_percentages'].items():
        print(f" {field_name}: {pct:.2f}%")

    # Aggregate score over the threshold checks.
    print(f" Overall quality score: {report['quality_score']:.2f}% ({report['criteria_met_count']}/{report['criteria_total_count']} criteria met)")

    # Most prolific authors, when any were found.
    if report.get('top_authors'):
        print(" Top authors:")
        for entry in report['top_authors']:
            print(f" {entry['name']}: {entry['paper_count']} papers")

    # Step 4: write the network to disk in both formats.
    os.makedirs("output", exist_ok=True)
    network_analyzer.export_graph_to_graphml(network, "output/cocitation_network.graphml")
    network_analyzer.export_graph_to_csv(network, "output/cocitation_nodes.csv", "output/cocitation_edges.csv")

    # Step 5: structural metrics.
    network_metrics = network_analyzer.calculate_network_metrics(network)
    print("\nNetwork metrics:")
    for metric_name, metric_value in network_metrics.items():
        print(f" {metric_name}: {metric_value}")

    # Step 6: community detection with the default algorithm.
    membership, score = network_analyzer.detect_communities(network)
    community_total = len(set(membership.values()))
    print(f"Detected {community_total} communities with modularity {score:.4f}")
|