bib2graph 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bib2graph/__init__.py ADDED
@@ -0,0 +1,20 @@
1
+ """
2
+ bib2graph - A Python package for bibliometric data processing and network analysis.
3
+
4
+ This package provides tools for:
5
+ 1. Ingesting bibliographic data from various sources
6
+ 2. Enriching data with additional metadata from external APIs
7
+ 3. Extracting and analyzing bibliometric networks
8
+
9
+ Main components:
10
+ - BibliometricDataLoader: For loading and normalizing bibliographic data
11
+ - BibliometricDataEnricher: For enriching data with additional metadata
12
+ - BibliometricNetworkAnalyzer: For extracting and analyzing networks
13
+ """
14
+
15
+ from bib2graph.consigue_los_articulos import BibliometricDataLoader
16
+ from bib2graph.enriquecimiento import BibliometricDataEnricher
17
+ from bib2graph.analisis_red import BibliometricNetworkAnalyzer
18
+ from bib2graph.models import *
19
+
20
+ __version__ = "0.1.0"
@@ -0,0 +1,645 @@
1
+ """
2
+ Network analysis module for bibliometric analysis of semiconductor supply chain.
3
+
4
+ This module provides functionality for extracting co-citation networks from the Neo4j database
5
+ and exporting them for analysis in external tools.
6
+ """
7
+
8
+ import os
9
+ import networkx as nx
10
+ from neomodel import db, config
11
+ from typing import Dict, Any, Tuple, Set
12
+
13
# Neo4j connection parameters.
# The defaults target a local development instance. Each value can be
# overridden through an environment variable of the same name so that real
# credentials do not have to be edited into this file.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "password")

# Configure the default neomodel connection at import time.
# The URL is derived from the constants above (instead of repeating the host
# and port literally) so that changing NEO4J_URI actually takes effect.
# A URI given without a scheme falls back to "bolt".
_NEO4J_SCHEME, _, _NEO4J_HOST = NEO4J_URI.rpartition("://")
config.DATABASE_URL = (
    f"{_NEO4J_SCHEME or 'bolt'}://{NEO4J_USER}:{NEO4J_PASSWORD}@{_NEO4J_HOST}"
)
20
+
21
+ class BibliometricNetworkAnalyzer:
22
+ """Class for analyzing bibliometric networks from Neo4j database."""
23
+
24
def __init__(self, uri: str = NEO4J_URI, user: str = NEO4J_USER, password: str = NEO4J_PASSWORD):
    """Initialize the network analyzer with Neo4j connection parameters.

    The credentials are embedded into ``uri`` and the result is stored in
    ``config.DATABASE_URL``; nothing is kept on the instance.

    Args:
        uri: Neo4j connection URI, e.g. ``bolt://localhost:7687``. The
            scheme is preserved; a URI without a scheme defaults to ``bolt``.
        user: Neo4j username
        password: Neo4j password
    """
    # Split off the scheme rather than assuming "bolt://": with the previous
    # str.replace() approach a URI such as "neo4j://host:7687" produced the
    # malformed URL "bolt://user:pw@neo4j://host:7687".
    scheme, _, host = uri.rpartition("://")
    config.DATABASE_URL = f"{scheme or 'bolt'}://{user}:{password}@{host}"
34
+
35
def create_co_citation_relationships(self) -> int:
    """Create CO_CITED_WITH relationships in Neo4j based on shared references.

    Two seed papers (``is_seed`` true) are linked when they both have a
    REFERENCES relationship to at least one common paper. The relationship
    weight is the number of distinct shared references and is refreshed on
    every run.

    Returns:
        Number of CO_CITED_WITH relationships created or updated
        (one per unordered pair of papers)
    """
    # The pattern is symmetric, so without an ordering constraint every pair
    # is produced twice ((a, b) and (b, a)); MERGE would still create only one
    # relationship, but the returned count would be doubled.
    # `id(p1) < id(p2)` keeps exactly one row per pair and also excludes
    # p1 = p2.
    # NOTE(review): id() is deprecated in Neo4j 5 in favour of elementId();
    # confirm against the server version in use.
    cypher_query = """
    MATCH (p1:Paper {is_seed: True})-[:REFERENCES]->(ref:Paper)<-[:REFERENCES]-(p2:Paper {is_seed: True})
    WHERE id(p1) < id(p2)
    WITH p1, p2, COUNT(DISTINCT ref) AS shared_refs
    MERGE (p1)-[r:CO_CITED_WITH]-(p2)
    SET r.weight = shared_refs
    RETURN COUNT(r) AS relationship_count
    """

    results, _ = db.cypher_query(cypher_query)
    return results[0][0] if results else 0
53
+
54
def generate_quality_report(self, dois_set: Set[str]) -> Dict[str, Any]:
    """Generate a quality report for the co-citation network.

    The report contains raw metrics plus ``meets_*_threshold`` booleans for
    five criteria (volume >= 200 documents, >= 90% of papers with outgoing
    references, years spanning 2000-2024, >= 5 distinct institution
    addresses, >= 10 recurring authors) and an overall ``quality_score``
    (percentage of criteria met). When ``dois_set`` is empty no query is
    issued and every metric takes its neutral default.

    Args:
        dois_set: Set of DOIs involved in the co-citation network

    Returns:
        Dictionary containing quality metrics
    """
    missing_fields = ("title", "year", "abstract", "authors", "keywords")

    # 1. Document volume check
    document_count = len(dois_set)
    report: Dict[str, Any] = {
        "document_count": document_count,
        "meets_volume_threshold": document_count >= 200,
    }

    if dois_set:
        # Build the query parameters once; every query filters on the same DOIs.
        params = {"dois": list(dois_set)}

        def first_row(cypher):
            """Run *cypher* with the DOI parameters and return its first row, or None."""
            rows, _ = db.cypher_query(cypher, params)
            return rows[0] if rows else None

        def cell(row, index):
            """Return ``row[index]``, treating a missing row or NULL value as 0."""
            if row is None or row[index] is None:
                return 0
            return row[index]

        # 2. DOI and references percentage
        row = first_row("""
        MATCH (p:Paper)
        WHERE p.doi IN $dois
        RETURN COUNT(p) AS total,
               SUM(
                   CASE
                       WHEN p.doi IS NOT NULL AND EXISTS { (p)-[:REFERENCES]->() }
                       THEN 1
                       ELSE 0
                   END
               ) AS with_doi_and_refs
        """)
        total = cell(row, 0)
        with_doi_and_refs = cell(row, 1)
        report["doi_ref_percentage"] = (with_doi_and_refs / total) * 100 if total > 0 else 0
        report["meets_doi_ref_threshold"] = report["doi_ref_percentage"] >= 90

        # 3. Temporal coverage
        row = first_row("""
        MATCH (p:Paper)
        WHERE p.doi IN $dois AND p.year IS NOT NULL
        RETURN MIN(toInteger(p.year)) AS min_year,
               MAX(toInteger(p.year)) AS max_year,
               COUNT(DISTINCT p.year) AS unique_years
        """)
        if row is not None and row[0] is not None:
            report["min_year"] = row[0]
            report["max_year"] = row[1]
            report["unique_years"] = row[2]
            report["temporal_coverage"] = f"{report['min_year']}–{report['max_year']}"
            # Coverage must include the whole 2000-2024 window.
            report["meets_temporal_threshold"] = (
                report["min_year"] <= 2000 and report["max_year"] >= 2024
            )
        else:
            report["temporal_coverage"] = "No data"
            report["meets_temporal_threshold"] = False

        # 4. Geographic diversity
        # The AUTHORED pattern is deliberately undirected: this report used
        # Paper->Author while the network extractors use Author->Paper, and
        # an undirected match works with either stored direction.
        # NOTE(review): this counts distinct institution *addresses*, which
        # is reported as a country count - confirm that `address` really
        # holds a country.
        row = first_row("""
        MATCH (p:Paper)-[:AUTHORED]-(a:Author)-[:AFFILIATED_WITH]->(i:Institution)
        WHERE p.doi IN $dois
        RETURN COUNT(DISTINCT i.address) AS country_count
        """)
        report["country_count"] = cell(row, 0)
        report["meets_geographic_threshold"] = report["country_count"] >= 5

        # 5. Key author participation
        row = first_row("""
        MATCH (p:Paper)-[:AUTHORED]-(a:Author)
        WHERE p.doi IN $dois
        WITH a, COUNT(p) AS paper_count
        WHERE paper_count > 1
        RETURN COUNT(a) AS recurring_authors
        """)
        report["recurring_authors"] = cell(row, 0)
        report["meets_author_threshold"] = report["recurring_authors"] >= 10

        # Top ten authors by number of papers, for display in the report.
        top_rows, _ = db.cypher_query("""
        MATCH (p:Paper)-[:AUTHORED]-(a:Author)
        WHERE p.doi IN $dois
        WITH a, COUNT(p) AS paper_count
        ORDER BY paper_count DESC
        LIMIT 10
        RETURN a.name AS author_name, paper_count
        """, params)
        report["top_authors"] = [
            {"name": author_name, "paper_count": paper_count}
            for author_name, paper_count in top_rows
        ]

        # 6. Source duplication level
        row = first_row("""
        MATCH (p:Paper)
        WHERE p.doi IN $dois
        RETURN COUNT(p) AS total,
               COUNT(DISTINCT p.source) AS unique_sources
        """)
        total = cell(row, 0)
        unique_sources = cell(row, 1)
        report["source_duplication_percentage"] = (
            ((total - unique_sources) / total) * 100 if total > 0 else 0
        )

        # 7. Missing data quality
        # Uses the EXISTS { ... } subquery form throughout; the legacy
        # EXISTS((pattern)) function form is rejected by Neo4j 5.
        # Column order after `total` must match `missing_fields`.
        row = first_row("""
        MATCH (p:Paper)
        WHERE p.doi IN $dois
        RETURN
            COUNT(p) AS total,
            SUM(CASE WHEN p.title IS NULL THEN 1 ELSE 0 END) AS missing_title,
            SUM(CASE WHEN p.year IS NULL THEN 1 ELSE 0 END) AS missing_year,
            SUM(CASE WHEN p.abstract IS NULL THEN 1 ELSE 0 END) AS missing_abstract,
            SUM(CASE WHEN NOT EXISTS { (p)-[:AUTHORED]-() } THEN 1 ELSE 0 END) AS missing_authors,
            SUM(CASE WHEN NOT EXISTS { (p)-[:HAS_KEYWORD]->() } THEN 1 ELSE 0 END) AS missing_keywords
        """)
        if row is not None:
            total = cell(row, 0)
            report["missing_data_percentages"] = {
                field: (row[index] / total) * 100
                if total > 0 and row[index] is not None
                else 0
                for index, field in enumerate(missing_fields, start=1)
            }
        else:
            report["missing_data_percentages"] = {field: 0 for field in missing_fields}
    else:
        # No documents: skip the database entirely and report neutral values.
        report["doi_ref_percentage"] = 0
        report["meets_doi_ref_threshold"] = False
        report["temporal_coverage"] = "No data"
        report["meets_temporal_threshold"] = False
        report["country_count"] = 0
        report["meets_geographic_threshold"] = False
        report["recurring_authors"] = 0
        report["meets_author_threshold"] = False
        report["top_authors"] = []
        report["source_duplication_percentage"] = 0
        report["missing_data_percentages"] = {field: 0 for field in missing_fields}

    # Overall quality assessment
    criteria_met = [
        report["meets_volume_threshold"],
        report["meets_doi_ref_threshold"],
        report["meets_temporal_threshold"],
        report["meets_geographic_threshold"],
        report["meets_author_threshold"],
    ]

    report["criteria_met_count"] = sum(1 for met in criteria_met if met)
    report["criteria_total_count"] = len(criteria_met)
    report["quality_score"] = (
        (report["criteria_met_count"] / report["criteria_total_count"]) * 100
        if report["criteria_total_count"] > 0
        else 0
    )

    return report
251
+
252
def extract_co_citation_network(self, min_weight: int = 1) -> Tuple[nx.Graph, Dict[str, Any]]:
    """Extract the co-citation network from Neo4j, keeping only papers that have relationships.

    Args:
        min_weight: Minimum weight for co-citation relationships

    Returns:
        Tuple containing:
        - NetworkX graph representing the co-citation network
        - Dictionary containing quality report metrics
    """
    graph = nx.Graph()

    # 1. Fetch only the relevant relationships (edges) first.
    cocitation_query = """
    MATCH (p1:Paper)-[r:CO_CITED_WITH]-(p2:Paper)
    WHERE r.weight >= $min_weight
    RETURN p1.doi AS source, p2.doi AS target, r.weight AS weight
    """
    edge_rows, edge_meta = db.cypher_query(cocitation_query, {"min_weight": min_weight})
    edge_columns = list(edge_meta)

    # Collect the usable edges together with the set of DOIs they touch.
    dois_set = set()
    weighted_edges = []
    for raw in edge_rows:
        rec = dict(zip(edge_columns, raw))
        src, dst, wt = rec['source'], rec['target'], rec['weight']
        # Drop rows with a missing endpoint or a missing weight.
        if not src or not dst or wt is None:
            continue
        weighted_edges.append((src, dst, wt))
        dois_set.update((src, dst))

    # 2. Load the attributes of exactly those papers as nodes.
    if dois_set:
        paper_query = """
        MATCH (p:Paper)
        WHERE p.doi IN $dois
        RETURN p.doi AS doi, p.title AS title, p.year AS year
        """
        paper_rows, paper_meta = db.cypher_query(paper_query, {"dois": list(dois_set)})
        paper_columns = list(paper_meta)
        for raw in paper_rows:
            rec = dict(zip(paper_columns, raw))
            # GraphML cannot store None, so substitute neutral placeholders.
            graph.add_node(
                rec['doi'],
                title="" if rec['title'] is None else rec['title'],
                year=-1 if rec['year'] is None else rec['year'],
            )

    # 3. Attach the edges, storing each weight under the 'weight' attribute.
    graph.add_weighted_edges_from(weighted_edges)

    # 4. Build the quality report for the papers that ended up in the network.
    return graph, self.generate_quality_report(dois_set)
307
+
308
def extract_author_collaboration_network(self) -> nx.Graph:
    """Extract author collaboration network from Neo4j.

    Nodes are keyed by author name and carry an ``orcid`` attribute (empty
    string when the author has none). An edge links two authors who share at
    least one paper; its ``weight`` is the number of distinct shared papers.
    Rows with a NULL author name are skipped because NetworkX does not accept
    ``None`` as a node.

    Returns:
        NetworkX graph representing the author collaboration network
    """
    G = nx.Graph()

    author_query = """
    MATCH (a:Author)
    RETURN a.name AS name, a.orcid AS orcid
    """

    # The AUTHORED pattern is undirected on purpose: other queries in this
    # module traverse it as Paper->Author, and an undirected match works with
    # either stored direction.
    collaboration_query = """
    MATCH (a1:Author)-[:AUTHORED]-(p:Paper)-[:AUTHORED]-(a2:Author)
    WHERE a1 <> a2
    WITH a1, a2, COUNT(DISTINCT p) AS collaboration_count
    RETURN a1.name AS source, a2.name AS target, collaboration_count AS weight
    """

    # Add nodes (authors)
    results, meta = db.cypher_query(author_query)
    columns = list(meta)
    for row in results:
        record = dict(zip(columns, row))
        if record['name'] is None:
            continue
        # GraphML export cannot serialise None, so store "" for a missing ORCID.
        orcid = record['orcid'] if record['orcid'] is not None else ""
        G.add_node(record['name'], orcid=orcid)

    # Add edges (collaboration relationships)
    results, meta = db.cypher_query(collaboration_query)
    columns = list(meta)
    for row in results:
        record = dict(zip(columns, row))
        if record['source'] is None or record['target'] is None:
            continue
        G.add_edge(record['source'], record['target'], weight=record['weight'])

    return G
355
+
356
def extract_institution_collaboration_network(self) -> nx.Graph:
    """Extract institution collaboration network from Neo4j.

    Nodes are keyed by institution name. An edge links two different
    institutions whose affiliated authors share at least one paper; its
    ``weight`` is the number of distinct shared papers. Rows with a NULL
    institution name are skipped because NetworkX does not accept ``None``
    as a node.

    Returns:
        NetworkX graph representing the institution collaboration network
    """
    G = nx.Graph()

    institution_query = """
    MATCH (i:Institution)
    RETURN i.name AS name
    """

    # The AUTHORED pattern is undirected on purpose: other queries in this
    # module traverse it as Paper->Author, and an undirected match works with
    # either stored direction.
    collaboration_query = """
    MATCH (i1:Institution)<-[:AFFILIATED_WITH]-(a1:Author)-[:AUTHORED]-(p:Paper)-[:AUTHORED]-(a2:Author)-[:AFFILIATED_WITH]->(i2:Institution)
    WHERE i1 <> i2
    WITH i1, i2, COUNT(DISTINCT p) AS collaboration_count
    RETURN i1.name AS source, i2.name AS target, collaboration_count AS weight
    """

    # Add nodes (institutions)
    results, meta = db.cypher_query(institution_query)
    columns = list(meta)
    for row in results:
        record = dict(zip(columns, row))
        if record['name'] is not None:
            G.add_node(record['name'])

    # Add edges (collaboration relationships)
    results, meta = db.cypher_query(collaboration_query)
    columns = list(meta)
    for row in results:
        record = dict(zip(columns, row))
        if record['source'] is None or record['target'] is None:
            continue
        G.add_edge(record['source'], record['target'], weight=record['weight'])

    return G
400
+
401
def extract_keyword_co_occurrence_network(self) -> nx.Graph:
    """Build the keyword co-occurrence network stored in Neo4j.

    Every Keyword becomes a node (keyed by its name); two keywords are
    connected when at least one paper has both, and the edge ``weight`` is
    the number of such papers.

    Returns:
        NetworkX graph representing the keyword co-occurrence network
    """
    graph = nx.Graph()

    # All keywords, so that isolated ones still appear as nodes.
    keyword_query = """
    MATCH (k:Keyword)
    RETURN k.name AS name
    """

    # Pairs of distinct keywords attached to the same paper.
    cooccurrence_query = """
    MATCH (k1:Keyword)<-[:HAS_KEYWORD]-(p:Paper)-[:HAS_KEYWORD]->(k2:Keyword)
    WHERE k1 <> k2
    WITH k1, k2, COUNT(p) AS cooccurrence_count
    RETURN k1.name AS source, k2.name AS target, cooccurrence_count AS weight
    """

    # Nodes: look values up by column name rather than by position.
    keyword_rows, keyword_meta = db.cypher_query(keyword_query)
    keyword_fields = list(keyword_meta)
    for raw in keyword_rows:
        keyword = dict(zip(keyword_fields, raw))
        graph.add_node(keyword['name'])

    # Edges
    pair_rows, pair_meta = db.cypher_query(cooccurrence_query)
    pair_fields = list(pair_meta)
    for raw in pair_rows:
        pair = dict(zip(pair_fields, raw))
        graph.add_edge(pair['source'], pair['target'], weight=pair['weight'])

    return graph
445
+
446
def export_graph_to_graphml(self, G: nx.Graph, filepath: str) -> None:
    """Export a NetworkX graph to GraphML format.

    GraphML cannot represent ``None``, so any graph, node or edge attribute
    whose value is ``None`` is written as an empty string. The substitution
    is done on a copy; ``G`` itself is not modified.

    Args:
        G: NetworkX graph to export
        filepath: Path to save the GraphML file
    """
    def blank_out_none(attrs):
        """Replace None values in *attrs* with "" in place."""
        for key, value in attrs.items():
            if value is None:
                # Assigning to an existing key is safe while iterating.
                attrs[key] = ""

    # Graph.copy() creates fresh attribute dicts, so editing them leaves the
    # caller's graph untouched.
    sanitized = G.copy()
    blank_out_none(sanitized.graph)
    for _, attrs in sanitized.nodes(data=True):
        blank_out_none(attrs)
    for _, _, attrs in sanitized.edges(data=True):
        blank_out_none(attrs)

    nx.write_graphml(sanitized, filepath)
454
+
455
def export_graph_to_csv(self, G: nx.Graph, nodes_filepath: str, edges_filepath: str) -> None:
    """Export a NetworkX graph to CSV format (nodes and edges files).

    The nodes file has the columns ``id`` plus one column per node attribute;
    the edges file has ``source``, ``target`` plus one column per edge
    attribute. Attribute columns are sorted by name so the layout is
    reproducible, every field is quoted, and embedded quotes/commas/newlines
    are escaped by the ``csv`` module. An attribute that is missing (or
    ``None``) for a given node or edge is written as an empty field.

    Args:
        G: NetworkX graph to export
        nodes_filepath: Path to save the nodes CSV file
        edges_filepath: Path to save the edges CSV file
    """
    import csv

    def write_table(path, key_columns, rows):
        """Write *rows* of ``(*keys, attr_dict)`` tuples to *path* as CSV."""
        # Union of every attribute name used by any row, in a stable order.
        attr_names = sorted({name for row in rows for name in row[-1]}, key=str)
        # newline='' lets the csv module control line endings itself.
        with open(path, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f, quoting=csv.QUOTE_ALL, lineterminator='\n')
            writer.writerow([*key_columns, *attr_names])
            for row in rows:
                attrs = row[-1]
                writer.writerow([*row[:-1], *(attrs.get(name, "") for name in attr_names)])

    write_table(nodes_filepath, ['id'], list(G.nodes(data=True)))
    write_table(edges_filepath, ['source', 'target'], list(G.edges(data=True)))
496
+
497
def calculate_network_metrics(self, G: nx.Graph) -> Dict[str, Any]:
    """Calculate various network metrics for a graph.

    Size, density and connected-component counts describe the whole graph;
    the centrality and clustering figures are computed on the largest
    connected component only. Betweenness centrality is skipped (its keys
    are absent from the result) when that component has 1000 nodes or more.
    An empty graph yields zeros for every metric instead of raising.

    Args:
        G: NetworkX graph to analyze

    Returns:
        Dictionary of network metrics
    """
    metrics = {}

    # Basic metrics
    metrics['node_count'] = G.number_of_nodes()
    metrics['edge_count'] = G.number_of_edges()

    if metrics['node_count'] == 0:
        # Connectivity is undefined for the null graph (nx.is_connected
        # raises) and there is no largest component to take max() over, so
        # report neutral values.
        metrics['density'] = 0
        metrics['connected_components'] = 0
        metrics['largest_component_size'] = 0
        metrics['max_degree_centrality'] = 0
        metrics['avg_degree_centrality'] = 0
        metrics['max_betweenness_centrality'] = 0
        metrics['avg_betweenness_centrality'] = 0
        metrics['avg_clustering_coefficient'] = 0
        return metrics

    metrics['density'] = nx.density(G)

    # Connected components (computed once and reused below)
    components = list(nx.connected_components(G))
    largest_cc = max(components, key=len)
    metrics['connected_components'] = len(components)
    metrics['largest_component_size'] = len(largest_cc)

    # Centrality measures are restricted to the largest component.
    largest_subgraph = G.subgraph(largest_cc).copy()

    # Degree centrality
    degree_centrality = nx.degree_centrality(largest_subgraph)
    metrics['max_degree_centrality'] = max(degree_centrality.values()) if degree_centrality else 0
    metrics['avg_degree_centrality'] = (
        sum(degree_centrality.values()) / len(degree_centrality) if degree_centrality else 0
    )

    # Betweenness centrality can be slow, so only compute it for smaller networks.
    if largest_subgraph.number_of_nodes() < 1000:
        betweenness_centrality = nx.betweenness_centrality(largest_subgraph)
        metrics['max_betweenness_centrality'] = (
            max(betweenness_centrality.values()) if betweenness_centrality else 0
        )
        metrics['avg_betweenness_centrality'] = (
            sum(betweenness_centrality.values()) / len(betweenness_centrality)
            if betweenness_centrality
            else 0
        )

    # Clustering coefficient
    metrics['avg_clustering_coefficient'] = nx.average_clustering(largest_subgraph)

    return metrics
541
+
542
def detect_communities(self, G: nx.Graph, algorithm: str = 'louvain') -> Tuple[Dict[Any, int], float]:
    """Detect communities in a graph using various algorithms.

    ``'louvain'`` needs the optional python-louvain package and falls back to
    greedy modularity when it cannot be imported. An unrecognised algorithm
    name puts every node in community 0 with modularity 0.0. For a graph
    without edges (where modularity is undefined) each node is returned as
    its own community with modularity 0.0.

    Args:
        G: NetworkX graph to analyze
        algorithm: Community detection algorithm to use ('louvain', 'label_propagation', 'greedy_modularity')

    Returns:
        Tuple of (community assignments, modularity score)
    """
    def to_partition(communities):
        """Map each node to the index of the community that contains it."""
        return {node: index for index, members in enumerate(communities) for node in members}

    known_algorithms = ('louvain', 'label_propagation', 'greedy_modularity')
    if algorithm in known_algorithms and G.number_of_edges() == 0:
        # Modularity divides by the total edge weight, so it cannot be
        # computed here; short-circuit instead of letting the library raise.
        return {node: index for index, node in enumerate(G.nodes())}, 0.0

    if algorithm == 'louvain':
        # Only the import is guarded, so errors raised by the algorithm
        # itself are not mistaken for a missing package.
        try:
            import community as community_louvain
        except ImportError:
            print("python-louvain package not installed. Falling back to greedy modularity.")
            algorithm = 'greedy_modularity'
        else:
            partition = community_louvain.best_partition(G)
            modularity = community_louvain.modularity(partition, G)
            return partition, modularity

    if algorithm == 'label_propagation':
        try:
            from networkx.algorithms import community
        except ImportError:
            print("NetworkX community algorithms not available. Falling back to greedy modularity.")
            algorithm = 'greedy_modularity'
        else:
            # Materialise the result: it is consumed twice (partition and
            # modularity) and must not be a one-shot iterable.
            communities = list(community.label_propagation_communities(G))
            return to_partition(communities), community.modularity(G, communities)

    if algorithm == 'greedy_modularity':
        try:
            from networkx.algorithms import community
        except ImportError:
            print("NetworkX community algorithms not available.")
            return {node: 0 for node in G.nodes()}, 0.0
        communities = list(community.greedy_modularity_communities(G))
        return to_partition(communities), community.modularity(G, communities)

    # Default fallback for unrecognised algorithm names
    return {node: 0 for node in G.nodes()}, 0.0
596
+
597
+ # Example usage
598
if __name__ == "__main__":
    # Demonstration run: build the co-citation relationships, extract the
    # network, print its quality report, export it and analyse it.
    analyzer = BibliometricNetworkAnalyzer()

    # Step 1: materialise CO_CITED_WITH relationships inside Neo4j.
    rel_count = analyzer.create_co_citation_relationships()
    print(f"Created {rel_count} CO_CITED_WITH relationships")

    # Step 2: pull the network together with its quality report.
    cocitation_network, quality_report = analyzer.extract_co_citation_network(min_weight=1)
    node_total = cocitation_network.number_of_nodes()
    edge_total = cocitation_network.number_of_edges()
    print(f"Co-citation network has {node_total} nodes and {edge_total} edges")

    # Step 3: print the report, one criterion per line.
    qr = quality_report
    print("\nQuality Report for Co-citation Network:")
    print(f" Document count: {qr['document_count']} (Threshold: ≥200, Met: {qr['meets_volume_threshold']})")
    print(f" DOI and references: {qr['doi_ref_percentage']:.2f}% (Threshold: ≥90%, Met: {qr['meets_doi_ref_threshold']})")
    print(f" Temporal coverage: {qr['temporal_coverage']} (Threshold: 2000-2024, Met: {qr['meets_temporal_threshold']})")
    print(f" Geographic diversity: {qr['country_count']} countries (Threshold: ≥5, Met: {qr['meets_geographic_threshold']})")
    print(f" Key authors: {qr['recurring_authors']} recurring authors (Threshold: ≥10, Met: {qr['meets_author_threshold']})")
    print(f" Source duplication: {qr['source_duplication_percentage']:.2f}%")

    # Share of papers missing each field.
    print(" Missing data percentages:")
    for field, percentage in qr['missing_data_percentages'].items():
        print(f" {field}: {percentage:.2f}%")

    # Overall score across the threshold criteria.
    print(f" Overall quality score: {qr['quality_score']:.2f}% ({qr['criteria_met_count']}/{qr['criteria_total_count']} criteria met)")

    # Most prolific authors, when the report has any.
    top_authors = qr.get('top_authors')
    if top_authors:
        print(" Top authors:")
        for author in top_authors:
            print(f" {author['name']}: {author['paper_count']} papers")

    # Step 4: write the network to disk as GraphML and CSV.
    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)
    analyzer.export_graph_to_graphml(cocitation_network, f"{output_dir}/cocitation_network.graphml")
    analyzer.export_graph_to_csv(
        cocitation_network,
        f"{output_dir}/cocitation_nodes.csv",
        f"{output_dir}/cocitation_edges.csv",
    )

    # Step 5: structural metrics.
    metrics = analyzer.calculate_network_metrics(cocitation_network)
    print("\nNetwork metrics:")
    for key, value in metrics.items():
        print(f" {key}: {value}")

    # Step 6: community detection.
    communities, modularity = analyzer.detect_communities(cocitation_network)
    community_total = len(set(communities.values()))
    print(f"Detected {community_total} communities with modularity {modularity:.4f}")