@j0hanz/superfetch 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. package/README.md +327 -0
  2. package/dist/config/index.d.ts +30 -0
  3. package/dist/config/index.d.ts.map +1 -0
  4. package/dist/config/index.js +42 -0
  5. package/dist/config/index.js.map +1 -0
  6. package/dist/errors/app-error.d.ts +71 -0
  7. package/dist/errors/app-error.d.ts.map +1 -0
  8. package/dist/errors/app-error.js +103 -0
  9. package/dist/errors/app-error.js.map +1 -0
  10. package/dist/errors/index.d.ts +2 -0
  11. package/dist/errors/index.d.ts.map +1 -0
  12. package/dist/errors/index.js +2 -0
  13. package/dist/errors/index.js.map +1 -0
  14. package/dist/index.d.ts +3 -0
  15. package/dist/index.d.ts.map +1 -0
  16. package/dist/index.js +179 -0
  17. package/dist/index.js.map +1 -0
  18. package/dist/middleware/error-handler.d.ts +7 -0
  19. package/dist/middleware/error-handler.d.ts.map +1 -0
  20. package/dist/middleware/error-handler.js +37 -0
  21. package/dist/middleware/error-handler.js.map +1 -0
  22. package/dist/middleware/rate-limiter.d.ts +33 -0
  23. package/dist/middleware/rate-limiter.d.ts.map +1 -0
  24. package/dist/middleware/rate-limiter.js +100 -0
  25. package/dist/middleware/rate-limiter.js.map +1 -0
  26. package/dist/prompts/index.d.ts +6 -0
  27. package/dist/prompts/index.d.ts.map +1 -0
  28. package/dist/prompts/index.js +81 -0
  29. package/dist/prompts/index.js.map +1 -0
  30. package/dist/resources/index.d.ts +6 -0
  31. package/dist/resources/index.d.ts.map +1 -0
  32. package/dist/resources/index.js +44 -0
  33. package/dist/resources/index.js.map +1 -0
  34. package/dist/server.d.ts +8 -0
  35. package/dist/server.d.ts.map +1 -0
  36. package/dist/server.js +39 -0
  37. package/dist/server.js.map +1 -0
  38. package/dist/services/cache.d.ts +16 -0
  39. package/dist/services/cache.d.ts.map +1 -0
  40. package/dist/services/cache.js +63 -0
  41. package/dist/services/cache.js.map +1 -0
  42. package/dist/services/cache.service.d.ts +52 -0
  43. package/dist/services/cache.service.d.ts.map +1 -0
  44. package/dist/services/cache.service.js +113 -0
  45. package/dist/services/cache.service.js.map +1 -0
  46. package/dist/services/extractor.d.ts +32 -0
  47. package/dist/services/extractor.d.ts.map +1 -0
  48. package/dist/services/extractor.js +97 -0
  49. package/dist/services/extractor.js.map +1 -0
  50. package/dist/services/extractor.service.d.ts +18 -0
  51. package/dist/services/extractor.service.d.ts.map +1 -0
  52. package/dist/services/extractor.service.js +75 -0
  53. package/dist/services/extractor.service.js.map +1 -0
  54. package/dist/services/fetcher.d.ts +9 -0
  55. package/dist/services/fetcher.d.ts.map +1 -0
  56. package/dist/services/fetcher.js +100 -0
  57. package/dist/services/fetcher.js.map +1 -0
  58. package/dist/services/fetcher.service.d.ts +18 -0
  59. package/dist/services/fetcher.service.d.ts.map +1 -0
  60. package/dist/services/fetcher.service.js +122 -0
  61. package/dist/services/fetcher.service.js.map +1 -0
  62. package/dist/services/logger.d.ts +5 -0
  63. package/dist/services/logger.d.ts.map +1 -0
  64. package/dist/services/logger.js +48 -0
  65. package/dist/services/logger.js.map +1 -0
  66. package/dist/services/logger.service.d.ts +5 -0
  67. package/dist/services/logger.service.d.ts.map +1 -0
  68. package/dist/services/logger.service.js +57 -0
  69. package/dist/services/logger.service.js.map +1 -0
  70. package/dist/services/parser.d.ts +6 -0
  71. package/dist/services/parser.d.ts.map +1 -0
  72. package/dist/services/parser.js +152 -0
  73. package/dist/services/parser.js.map +1 -0
  74. package/dist/services/parser.service.d.ts +42 -0
  75. package/dist/services/parser.service.d.ts.map +1 -0
  76. package/dist/services/parser.service.js +209 -0
  77. package/dist/services/parser.service.js.map +1 -0
  78. package/dist/tools/handlers/fetch-links.tool.d.ts +20 -0
  79. package/dist/tools/handlers/fetch-links.tool.d.ts.map +1 -0
  80. package/dist/tools/handlers/fetch-links.tool.js +91 -0
  81. package/dist/tools/handlers/fetch-links.tool.js.map +1 -0
  82. package/dist/tools/handlers/fetch-markdown.tool.d.ts +17 -0
  83. package/dist/tools/handlers/fetch-markdown.tool.d.ts.map +1 -0
  84. package/dist/tools/handlers/fetch-markdown.tool.js +99 -0
  85. package/dist/tools/handlers/fetch-markdown.tool.js.map +1 -0
  86. package/dist/tools/handlers/fetch-url.tool.d.ts +17 -0
  87. package/dist/tools/handlers/fetch-url.tool.d.ts.map +1 -0
  88. package/dist/tools/handlers/fetch-url.tool.js +103 -0
  89. package/dist/tools/handlers/fetch-url.tool.js.map +1 -0
  90. package/dist/tools/index.d.ts +7 -0
  91. package/dist/tools/index.d.ts.map +1 -0
  92. package/dist/tools/index.js +83 -0
  93. package/dist/tools/index.js.map +1 -0
  94. package/dist/transformers/jsonl.transformer.d.ts +4 -0
  95. package/dist/transformers/jsonl.transformer.d.ts.map +1 -0
  96. package/dist/transformers/jsonl.transformer.js +42 -0
  97. package/dist/transformers/jsonl.transformer.js.map +1 -0
  98. package/dist/transformers/markdown.transformer.d.ts +4 -0
  99. package/dist/transformers/markdown.transformer.d.ts.map +1 -0
  100. package/dist/transformers/markdown.transformer.js +104 -0
  101. package/dist/transformers/markdown.transformer.js.map +1 -0
  102. package/dist/types/content.types.d.ts +63 -0
  103. package/dist/types/content.types.d.ts.map +1 -0
  104. package/dist/types/content.types.js +2 -0
  105. package/dist/types/content.types.js.map +1 -0
  106. package/dist/types/index.d.ts +3 -0
  107. package/dist/types/index.d.ts.map +1 -0
  108. package/dist/types/index.js +3 -0
  109. package/dist/types/index.js.map +1 -0
  110. package/dist/types/schemas.d.ts +22 -0
  111. package/dist/types/schemas.d.ts.map +1 -0
  112. package/dist/types/schemas.js +5 -0
  113. package/dist/types/schemas.js.map +1 -0
  114. package/dist/utils/sanitizer.d.ts +9 -0
  115. package/dist/utils/sanitizer.d.ts.map +1 -0
  116. package/dist/utils/sanitizer.js +19 -0
  117. package/dist/utils/sanitizer.js.map +1 -0
  118. package/dist/utils/url-validator.d.ts +10 -0
  119. package/dist/utils/url-validator.d.ts.map +1 -0
  120. package/dist/utils/url-validator.js +69 -0
  121. package/dist/utils/url-validator.js.map +1 -0
  122. package/package.json +80 -0
@@ -0,0 +1,63 @@
1
+ import NodeCache from 'node-cache';
2
+ import { config } from '../config/index.js';
3
// Shared in-memory store for fetched content.
// - stdTTL is given in seconds (set() multiplies the same value by 1000 to
//   compute `expiresAt`).
// - checkperiod: node-cache treats 0 as "never run the periodic expiry sweep",
//   so a positive TTL below 10 seconds is clamped to a 1-second sweep instead
//   of rounding down to 0. A non-positive TTL keeps the sweep disabled.
// - useClones: false stores and returns entries without cloning them.
const cache = new NodeCache({
  stdTTL: config.cache.ttl,
  checkperiod:
    config.cache.ttl > 0 ? Math.max(1, Math.floor(config.cache.ttl / 10)) : 0,
  useClones: false,
  maxKeys: config.cache.maxKeys,
});

// Module-level counters reported by getStats(); clear() does not reset them.
const stats = { hits: 0, misses: 0, sets: 0 };

// Largest content accepted by set(): 5 * 1024 * 1024. It is compared against
// `content.length`, i.e. string length rather than encoded byte size.
const maxContentSize = 5242880;
12
/**
 * Builds the cache key for a URL inside a namespace.
 *
 * @param namespace - Prefix placed before the colon.
 * @param url - URL placed after the colon.
 * @returns The string `<namespace>:<url>`.
 */
export function createCacheKey(namespace, url) {
  const namespacedKey = `${namespace}:${url}`;
  return namespacedKey;
}
15
/**
 * Looks up a cache entry by its key.
 *
 * The key is passed to node-cache unchanged (no hashing). A truthy result
 * counts as a hit, anything else as a miss. When caching is disabled the
 * lookup is skipped and neither counter changes.
 *
 * @param cacheKey - Key to look up.
 * @returns The stored entry, or `undefined` on a miss or when caching is disabled.
 */
export function get(cacheKey) {
  if (config.cache.enabled) {
    const cached = cache.get(cacheKey);
    if (cached) {
      stats.hits += 1;
      return cached;
    }
    stats.misses += 1;
  }
  return undefined;
}
28
/**
 * Stores content in the cache on a best-effort basis.
 *
 * Nothing is stored when caching is disabled, when `content.length` exceeds
 * `maxContentSize`, or when node-cache reports that it is full.
 *
 * @param cacheKey - Key to store the entry under; it is also copied into the
 *   entry's `url` field.
 * @param content - Content to cache.
 */
export function set(cacheKey, content) {
  if (!config.cache.enabled) return;
  if (content.length > maxContentSize) return;

  const now = new Date();
  const entry = {
    url: cacheKey,
    content,
    fetchedAt: now.toISOString(),
    // config.cache.ttl is in seconds; Date arithmetic is in milliseconds.
    expiresAt: new Date(now.getTime() + config.cache.ttl * 1000).toISOString(),
  };

  try {
    // The key is used as-is, which keeps keys() output readable.
    cache.set(cacheKey, entry);
  } catch (error) {
    // node-cache throws an error named ECACHEFULL once `maxKeys` is reached.
    // A full cache must not fail the caller, so the write is skipped.
    const cacheIsFull =
      error instanceof Error &&
      (error.name === 'ECACHEFULL' || error.errorcode === 'ECACHEFULL');
    if (cacheIsFull) return;
    throw error;
  }
  stats.sets++;
}
44
/**
 * Removes every entry from the cache by calling node-cache's `flushAll()`.
 * The module-level hit/miss/set counters are left as they are.
 */
export function clear() {
  cache.flushAll();
}
47
/**
 * Reports the cache size, its configured limits and the lookup counters.
 *
 * @returns An object with `size` (number of stored keys), `maxKeys`, `ttl`,
 *   `hits`, `misses`, `sets`, and `hitRate`, a percentage string with two
 *   decimals (`'0.00%'` when no lookup has been counted yet).
 */
export function getStats() {
  const { hits, misses, sets } = stats;
  const lookups = hits + misses;

  let percentage = '0.00';
  if (lookups > 0) {
    percentage = ((hits / lookups) * 100).toFixed(2);
  }

  return {
    size: cache.keys().length,
    maxKeys: config.cache.maxKeys,
    ttl: config.cache.ttl,
    hits,
    misses,
    sets,
    hitRate: `${percentage}%`,
  };
}
60
/**
 * Lists the keys currently held by the cache.
 *
 * @returns The array returned by node-cache's `keys()`.
 */
export function keys() {
  const storedKeys = cache.keys();
  return storedKeys;
}
63
+ //# sourceMappingURL=cache.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cache.js","sourceRoot":"","sources":["../../src/services/cache.ts"],"names":[],"mappings":"AAAA,OAAO,SAAS,MAAM,YAAY,CAAC;AACnC,OAAO,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAG5C,MAAM,KAAK,GAAG,IAAI,SAAS,CAAC;IAC1B,MAAM,EAAE,MAAM,CAAC,KAAK,CAAC,GAAG;IACxB,WAAW,EAAE,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,GAAG,EAAE,CAAC;IAC9C,SAAS,EAAE,KAAK;IAChB,OAAO,EAAE,MAAM,CAAC,KAAK,CAAC,OAAO;CAC9B,CAAC,CAAC;AAEH,MAAM,KAAK,GAAG,EAAE,IAAI,EAAE,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,CAAC;AAC9C,iDAAiD;AACjD,MAAM,cAAc,GAAG,OAAO,CAAC;AAE/B,MAAM,UAAU,cAAc,CAAC,SAAiB,EAAE,GAAW;IAC3D,OAAO,GAAG,SAAS,IAAI,GAAG,EAAE,CAAC;AAC/B,CAAC;AAED,MAAM,UAAU,GAAG,CAAC,QAAgB;IAClC,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,OAAO;QAAE,OAAO,SAAS,CAAC;IAE5C,6DAA6D;IAC7D,uDAAuD;IACvD,MAAM,KAAK,GAAG,KAAK,CAAC,GAAG,CAAa,QAAQ,CAAC,CAAC;IAC9C,IAAI,KAAK,EAAE,CAAC;QACV,KAAK,CAAC,IAAI,EAAE,CAAC;QACb,OAAO,KAAK,CAAC;IACf,CAAC;IAED,KAAK,CAAC,MAAM,EAAE,CAAC;IACf,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,MAAM,UAAU,GAAG,CAAC,QAAgB,EAAE,OAAe;IACnD,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,OAAO;QAAE,OAAO;IAClC,IAAI,OAAO,CAAC,MAAM,GAAG,cAAc;QAAE,OAAO;IAE5C,MAAM,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC;IACvB,MAAM,KAAK,GAAe;QACxB,GAAG,EAAE,QAAQ;QACb,OAAO;QACP,SAAS,EAAE,GAAG,CAAC,WAAW,EAAE;QAC5B,SAAS,EAAE,IAAI,IAAI,CAAC,GAAG,CAAC,OAAO,EAAE,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,GAAG,IAAI,CAAC,CAAC,WAAW,EAAE;KAC3E,CAAC;IAEF,kEAAkE;IAClE,KAAK,CAAC,GAAG,CAAC,QAAQ,EAAE,KAAK,CAAC,CAAC;IAC3B,KAAK,CAAC,IAAI,EAAE,CAAC;AACf,CAAC;AAED,MAAM,UAAU,KAAK;IACnB,KAAK,CAAC,QAAQ,EAAE,CAAC;AACnB,CAAC;AAED,MAAM,UAAU,QAAQ;IACtB,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,GAAG,KAAK,CAAC,MAAM,CAAC;IACxC,MAAM,OAAO,GAAG,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,GAAG,KAAK,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;IAE7E,OAAO;QACL,IAAI,EAAE,KAAK,CAAC,IAAI,EAAE,CAAC,MAAM;QACzB,OAAO,EAAE,MAAM,CAAC,KAAK,CAAC,OAAO;QAC7B,GAAG,EAAE,MAAM,CAAC,KAAK,CAAC,GAAG;QACrB,IAAI,EAAE,KAAK,CAAC,IAAI;QAChB,MAAM,EAAE,KAAK,CAAC,MAAM;QACpB,IAAI,EAAE,KAAK,CAAC,IAAI;QAChB,OAAO,
EAAE,GAAG,OAAO,GAAG;KACvB,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,IAAI;IAClB,OAAO,KAAK,CAAC,IAAI,EAAE,CAAC;AACtB,CAAC"}
@@ -0,0 +1,52 @@
1
+ import { CacheEntry } from '../types/index.js';
2
+ declare class CacheService {
3
+ private cache; // underlying node-cache instance
4
+ private stats; // hit / miss / set counters reported by getStats()
5
+ constructor();
6
+ /**
7
+ * Derives the internal storage key: the first 16 hex characters of the SHA-256 digest of the given key
8
+ * @param key - The key to hash (can include namespace prefix)
9
+ */
10
+ private hashKey;
11
+ /**
12
+ * Creates a namespaced cache key of the form `<namespace>:<url>`
13
+ * @param namespace - The namespace (e.g., 'url', 'links', 'markdown')
14
+ * @param url - The URL to cache
15
+ */
16
+ createCacheKey(namespace: string, url: string): string;
17
+ /**
18
+ * Gets content from cache and counts a hit or a miss; returns undefined on a miss or when caching is disabled
19
+ * @param cacheKey - The cache key (use createCacheKey for namespaced keys)
20
+ */
21
+ get(cacheKey: string): CacheEntry | undefined;
22
+ private readonly maxContentSize; // read from CACHE_MAX_CONTENT_SIZE, defaulting to 5242880
23
+ /**
24
+ * Sets content in cache; content longer than maxContentSize is skipped, as is every write while caching is disabled
25
+ * @param cacheKey - The cache key (use createCacheKey for namespaced keys)
26
+ * @param content - The content to cache
27
+ */
28
+ set(cacheKey: string, content: string): void;
29
+ /**
30
+ * Clears all cache entries (the hit / miss / set counters are not reset)
31
+ */
32
+ clear(): void;
33
+ /**
34
+ * Gets cache statistics
35
+ */
36
+ getStats(): {
37
+ size: number; // number of keys currently stored
38
+ maxKeys: number;
39
+ ttl: number;
40
+ hits: number;
41
+ misses: number;
42
+ sets: number;
43
+ hitRate: string; // percentage with two decimals and a trailing '%'
44
+ };
45
+ /**
46
+ * Gets all cache keys as stored, i.e. the hashed form produced by hashKey rather than the original `<namespace>:<url>` strings
47
+ */
48
+ keys(): string[];
49
+ }
50
+ export declare const cacheService: CacheService; // shared singleton instance
51
+ export {};
52
+ //# sourceMappingURL=cache.service.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cache.service.d.ts","sourceRoot":"","sources":["../../src/services/cache.service.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAE/C,cAAM,YAAY;IAChB,OAAO,CAAC,KAAK,CAAY;IACzB,OAAO,CAAC,KAAK,CAIX;;IAWF;;;OAGG;IACH,OAAO,CAAC,OAAO;IAQf;;;;OAIG;IACH,cAAc,CAAC,SAAS,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM;IAItD;;;OAGG;IACH,GAAG,CAAC,QAAQ,EAAE,MAAM,GAAG,UAAU,GAAG,SAAS;IAkB7C,OAAO,CAAC,QAAQ,CAAC,cAAc,CAG7B;IAEF;;;;OAIG;IACH,GAAG,CAAC,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,IAAI;IAyB5C;;OAEG;IACH,KAAK,IAAI,IAAI;IAIb;;OAEG;IACH,QAAQ;;;;;;;;;IAgBR;;OAEG;IACH,IAAI,IAAI,MAAM,EAAE;CAGjB;AAGD,eAAO,MAAM,YAAY,cAAqB,CAAC"}
@@ -0,0 +1,113 @@
1
+ import NodeCache from 'node-cache';
2
+ import crypto from 'crypto';
3
+ import { config } from '../config/index.js';
4
/**
 * Parses the CACHE_MAX_CONTENT_SIZE override.
 *
 * @param raw - Raw environment value; may be undefined or empty.
 * @returns The parsed integer, or 5242880 (5 * 1024 * 1024) when the value is
 *   missing, empty or not a number. Without the NaN fallback an unparseable
 *   value would make every `length > NaN` comparison false and silently
 *   disable the size guard in set().
 */
function parseMaxContentSize(raw) {
  const parsed = parseInt(raw || '5242880', 10);
  return Number.isNaN(parsed) ? 5242880 : parsed;
}

/**
 * In-memory content cache backed by node-cache.
 *
 * Entries are stored under a truncated SHA-256 hash of the caller's key, so
 * keys() returns hashes rather than the original `<namespace>:<url>` strings.
 */
class CacheService {
  cache;
  stats = {
    hits: 0,
    misses: 0,
    sets: 0,
  };
  // Maximum content length accepted by set(); compared against string length.
  maxContentSize = parseMaxContentSize(process.env.CACHE_MAX_CONTENT_SIZE);

  constructor() {
    this.cache = new NodeCache({
      stdTTL: config.cache.ttl,
      // node-cache treats checkperiod 0 as "no periodic expiry sweep", so a
      // positive TTL below 10 seconds is clamped to a 1-second sweep.
      checkperiod:
        config.cache.ttl > 0
          ? Math.max(1, Math.floor(config.cache.ttl / 10))
          : 0,
      useClones: false,
      maxKeys: config.cache.maxKeys,
    });
  }

  /**
   * Derives the internal storage key for a cache key.
   * @param key - The key to hash (can include namespace prefix)
   * @returns The first 16 hex characters of the SHA-256 digest of `key`.
   */
  hashKey(key) {
    return crypto
      .createHash('sha256')
      .update(key)
      .digest('hex')
      .substring(0, 16);
  }

  /**
   * Creates a namespaced cache key.
   * @param namespace - The namespace (e.g., 'url', 'links', 'markdown')
   * @param url - The URL to cache
   * @returns The string `<namespace>:<url>`.
   */
  createCacheKey(namespace, url) {
    return `${namespace}:${url}`;
  }

  /**
   * Gets content from cache and counts a hit or a miss.
   * @param cacheKey - The cache key (use createCacheKey for namespaced keys)
   * @returns The stored entry, or undefined on a miss or when caching is disabled.
   */
  get(cacheKey) {
    if (!config.cache.enabled) {
      return undefined;
    }
    const key = this.hashKey(cacheKey);
    const entry = this.cache.get(key);
    if (entry) {
      this.stats.hits++;
      return entry;
    }
    this.stats.misses++;
    return undefined;
  }

  /**
   * Sets content in cache on a best-effort basis. Nothing is stored when
   * caching is disabled, when the content exceeds maxContentSize, or when
   * node-cache reports that it is full.
   * @param cacheKey - The cache key (use createCacheKey for namespaced keys)
   * @param content - The content to cache
   */
  set(cacheKey, content) {
    if (!config.cache.enabled) {
      return;
    }
    // Validate content size to prevent memory exhaustion
    if (content.length > this.maxContentSize) {
      return; // Skip caching oversized content
    }
    const key = this.hashKey(cacheKey);
    const now = new Date();
    // config.cache.ttl is in seconds; Date arithmetic is in milliseconds.
    const expiresAt = new Date(now.getTime() + config.cache.ttl * 1000);
    const entry = {
      url: cacheKey,
      content,
      fetchedAt: now.toISOString(),
      expiresAt: expiresAt.toISOString(),
    };
    try {
      this.cache.set(key, entry);
    } catch (error) {
      // node-cache throws an error named ECACHEFULL once `maxKeys` is
      // reached. A full cache must not fail the caller, so skip the write.
      const cacheIsFull =
        error instanceof Error &&
        (error.name === 'ECACHEFULL' || error.errorcode === 'ECACHEFULL');
      if (cacheIsFull) {
        return;
      }
      throw error;
    }
    this.stats.sets++;
  }

  /**
   * Clears all cache entries. The hit / miss / set counters are not reset.
   */
  clear() {
    this.cache.flushAll();
  }

  /**
   * Gets cache statistics.
   * @returns Size, configured limits, counters and `hitRate`, a percentage
   *   string with two decimals ('0.00%' before any lookup is counted).
   */
  getStats() {
    const total = this.stats.hits + this.stats.misses;
    const hitRate =
      total > 0 ? ((this.stats.hits / total) * 100).toFixed(2) : '0.00';
    return {
      size: this.cache.keys().length,
      maxKeys: config.cache.maxKeys,
      ttl: config.cache.ttl,
      hits: this.stats.hits,
      misses: this.stats.misses,
      sets: this.stats.sets,
      hitRate: `${hitRate}%`,
    };
  }

  /**
   * Gets all cache keys as stored (hashed form, see hashKey).
   */
  keys() {
    return this.cache.keys();
  }
}
111
+ // Singleton instance, created once when this module is first loaded
112
+ export const cacheService = new CacheService();
113
+ //# sourceMappingURL=cache.service.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cache.service.js","sourceRoot":"","sources":["../../src/services/cache.service.ts"],"names":[],"mappings":"AAAA,OAAO,SAAS,MAAM,YAAY,CAAC;AACnC,OAAO,MAAM,MAAM,QAAQ,CAAC;AAC5B,OAAO,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAG5C,MAAM,YAAY;IACR,KAAK,CAAY;IACjB,KAAK,GAAG;QACd,IAAI,EAAE,CAAC;QACP,MAAM,EAAE,CAAC;QACT,IAAI,EAAE,CAAC;KACR,CAAC;IAEF;QACE,IAAI,CAAC,KAAK,GAAG,IAAI,SAAS,CAAC;YACzB,MAAM,EAAE,MAAM,CAAC,KAAK,CAAC,GAAG;YACxB,WAAW,EAAE,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,GAAG,EAAE,CAAC;YAC9C,SAAS,EAAE,KAAK;YAChB,OAAO,EAAE,MAAM,CAAC,KAAK,CAAC,OAAO;SAC9B,CAAC,CAAC;IACL,CAAC;IAED;;;OAGG;IACK,OAAO,CAAC,GAAW;QACzB,OAAO,MAAM;aACV,UAAU,CAAC,QAAQ,CAAC;aACpB,MAAM,CAAC,GAAG,CAAC;aACX,MAAM,CAAC,KAAK,CAAC;aACb,SAAS,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IACtB,CAAC;IAED;;;;OAIG;IACH,cAAc,CAAC,SAAiB,EAAE,GAAW;QAC3C,OAAO,GAAG,SAAS,IAAI,GAAG,EAAE,CAAC;IAC/B,CAAC;IAED;;;OAGG;IACH,GAAG,CAAC,QAAgB;QAClB,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;YAC1B,OAAO,SAAS,CAAC;QACnB,CAAC;QAED,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;QACnC,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAa,GAAG,CAAC,CAAC;QAE9C,IAAI,KAAK,EAAE,CAAC;YACV,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;YAClB,OAAO,KAAK,CAAC;QACf,CAAC;QAED,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC;QACpB,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,8CAA8C;IAC7B,cAAc,GAAG,QAAQ,CACxC,OAAO,CAAC,GAAG,CAAC,sBAAsB,IAAI,SAAS,EAC/C,EAAE,CACH,CAAC;IAEF;;;;OAIG;IACH,GAAG,CAAC,QAAgB,EAAE,OAAe;QACnC,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;YAC1B,OAAO;QACT,CAAC;QAED,qDAAqD;QACrD,IAAI,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,cAAc,EAAE,CAAC;YACzC,OAAO,CAAC,iCAAiC;QAC3C,CAAC;QAED,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC;QACnC,MAAM,GAAG,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,SAAS,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,OAAO,EAAE,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,GAAG,IAAI,CAAC,CAAC;QAEpE,MAAM,KAAK,GAAe;YACxB,GAAG,EAAE,QAAQ;YACb,OAAO;YACP,SAAS,EAAE,GAAG,CAAC,WAAW,EAAE;YAC5B,SAAS,EAAE,SAAS,CAAC,WAAW,EAAE;SACnC,CAAC;QAEF,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;QAC3B,IAAI,
CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;IACpB,CAAC;IAED;;OAEG;IACH,KAAK;QACH,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;IACxB,CAAC;IAED;;OAEG;IACH,QAAQ;QACN,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC;QAClD,MAAM,OAAO,GACX,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,GAAG,KAAK,CAAC,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;QAEpE,OAAO;YACL,IAAI,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,MAAM;YAC9B,OAAO,EAAE,MAAM,CAAC,KAAK,CAAC,OAAO;YAC7B,GAAG,EAAE,MAAM,CAAC,KAAK,CAAC,GAAG;YACrB,IAAI,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI;YACrB,MAAM,EAAE,IAAI,CAAC,KAAK,CAAC,MAAM;YACzB,IAAI,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI;YACrB,OAAO,EAAE,GAAG,OAAO,GAAG;SACvB,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,IAAI;QACF,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;IAC3B,CAAC;CACF;AAED,qBAAqB;AACrB,MAAM,CAAC,MAAM,YAAY,GAAG,IAAI,YAAY,EAAE,CAAC"}
@@ -0,0 +1,32 @@
1
+ import type { ExtractedArticle } from '../types/index.js';
2
+ /**
3
+ * Metadata extracted from an HTML document's <meta> tags and <title>; every field is optional because a page may not declare it
4
+ */
5
+ export interface ExtractedMetadata {
6
+ title?: string; // og:title, then twitter:title, then the <title> text
7
+ description?: string; // og:description, then twitter:description, then meta description
8
+ author?: string; // meta author, then article:author
9
+ }
10
+ /**
11
+ * Combined extraction result from a single JSDOM parse
12
+ */
13
+ export interface ExtractionResult {
14
+ article: ExtractedArticle | null; // null when Readability finds no article or extraction throws
15
+ metadata: ExtractedMetadata; // empty object when extraction throws
16
+ }
17
+ /**
18
+ * Extracts both article content and metadata from HTML in a single JSDOM parse.
19
+ * This is more efficient than calling extractArticle and extractMetadata separately. Failures are logged and reported as `{ article: null, metadata: {} }` rather than thrown.
20
+ */
21
+ export declare function extractContent(html: string, url: string): ExtractionResult;
22
+ /**
23
+ * Extracts main article content using Mozilla Readability; returns null when no article is found or when parsing throws (the failure is logged)
24
+ * @deprecated Use extractContent() for better performance when you need both article and metadata
25
+ */
26
+ export declare function extractArticle(html: string, url: string): ExtractedArticle | null;
27
+ /**
28
+ * Extracts metadata from HTML; returns an empty object when parsing throws (the failure is logged)
29
+ * @deprecated Use extractContent() for better performance when you need both article and metadata
30
+ */
31
+ export declare function extractMetadata(html: string): ExtractedMetadata;
32
+ //# sourceMappingURL=extractor.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extractor.d.ts","sourceRoot":"","sources":["../../src/services/extractor.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,mBAAmB,CAAC;AAG1D;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B,OAAO,EAAE,gBAAgB,GAAG,IAAI,CAAC;IACjC,QAAQ,EAAE,iBAAiB,CAAC;CAC7B;AAuDD;;;GAGG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,gBAAgB,CAgB1E;AAED;;;GAGG;AACH,wBAAgB,cAAc,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,gBAAgB,GAAG,IAAI,CAQjF;AAED;;;GAGG;AACH,wBAAgB,eAAe,CAAC,IAAI,EAAE,MAAM,GAAG,iBAAiB,CAQ/D"}
@@ -0,0 +1,97 @@
1
+ import { JSDOM } from 'jsdom';
2
+ import { Readability } from '@mozilla/readability';
3
+ import { logError } from './logger.js';
4
/**
 * Returns the `content` attribute of the first selector that matches an
 * element with a non-empty `content` value.
 *
 * @param document - Object exposing `querySelector`.
 * @param selectors - CSS selectors, tried in order.
 * @returns The first non-empty `content` value, or `undefined` if none is found.
 */
function getMetaContent(document, selectors) {
  for (const query of selectors) {
    const element = document.querySelector(query);
    if (element === null || element === undefined) {
      continue;
    }
    const value = element.getAttribute('content');
    if (value) {
      return value;
    }
  }
  return undefined;
}
12
/**
 * Reads title, description and author from an already-parsed Document.
 *
 * Each field takes the first non-empty `content` attribute among its meta
 * selectors. The title additionally falls back to the `<title>` element's
 * text content.
 *
 * @param document - Parsed DOM document.
 * @returns `{ title, description, author }`; a field is `undefined` when no
 *   source provides it.
 */
function extractMetadataFromDocument(document) {
  const firstContent = (selectors) => getMetaContent(document, selectors);

  let title = firstContent([
    'meta[property="og:title"]',
    'meta[name="twitter:title"]',
  ]);
  if (title == null) {
    const titleElement = document.querySelector('title');
    title = titleElement?.textContent ?? undefined;
  }

  const description = firstContent([
    'meta[property="og:description"]',
    'meta[name="twitter:description"]',
    'meta[name="description"]',
  ]);

  const author = firstContent([
    'meta[name="author"]',
    'meta[property="article:author"]',
  ]);

  return { title, description, author };
}
31
/**
 * Runs Readability over a deep clone of an already-parsed Document.
 *
 * The clone is required because Readability mutates the document it parses
 * and callers may still need the original.
 *
 * @param document - Parsed DOM document; left unmodified.
 * @returns The extracted article, or `null` when Readability returns nothing.
 *   Nullish `title`, `byline`, `excerpt` and `siteName` become `undefined`;
 *   nullish `content` and `textContent` become `''`.
 */
function extractArticleFromDocument(document) {
  const workingCopy = document.cloneNode(true);
  const parsed = new Readability(workingCopy).parse();
  if (!parsed) {
    return null;
  }

  const { title, byline, content, textContent, excerpt, siteName } = parsed;
  return {
    title: title ?? undefined,
    byline: byline ?? undefined,
    content: content ?? '',
    textContent: textContent ?? '',
    excerpt: excerpt ?? undefined,
    siteName: siteName ?? undefined,
  };
}
50
/**
 * Parses the HTML once with JSDOM and extracts both metadata and article.
 * Cheaper than calling extractArticle and extractMetadata separately, which
 * each build their own JSDOM instance.
 *
 * @param html - HTML source to parse.
 * @param url - URL passed to JSDOM as the document URL.
 * @returns `{ article, metadata }`. If anything throws, the error is logged
 *   and `{ article: null, metadata: {} }` is returned.
 */
export function extractContent(html, url) {
  try {
    const { window } = new JSDOM(html, { url });
    // Metadata is read first; it does not modify the document.
    const metadata = extractMetadataFromDocument(window.document);
    // Article extraction works on a clone made by the helper.
    const article = extractArticleFromDocument(window.document);
    return { article, metadata };
  } catch (error) {
    const cause = error instanceof Error ? error : undefined;
    logError('Failed to extract content', cause);
    return { article: null, metadata: {} };
  }
}
69
/**
 * Parses the HTML with JSDOM and extracts the main article via Readability.
 *
 * @param html - HTML source to parse.
 * @param url - URL passed to JSDOM as the document URL.
 * @returns The extracted article, or `null` when none is found or when
 *   parsing throws (the error is logged).
 * @deprecated Use extractContent() for better performance when you need both article and metadata
 */
export function extractArticle(html, url) {
  try {
    const { document } = new JSDOM(html, { url }).window;
    return extractArticleFromDocument(document);
  } catch (error) {
    const cause = error instanceof Error ? error : undefined;
    logError('Failed to extract article', cause);
    return null;
  }
}
83
/**
 * Parses the HTML with JSDOM (no document URL) and extracts its metadata.
 *
 * @param html - HTML source to parse.
 * @returns `{ title, description, author }`, or `{}` when parsing throws
 *   (the error is logged).
 * @deprecated Use extractContent() for better performance when you need both article and metadata
 */
export function extractMetadata(html) {
  try {
    const dom = new JSDOM(html);
    return extractMetadataFromDocument(dom.window.document);
  } catch (error) {
    const cause = error instanceof Error ? error : undefined;
    logError('Failed to extract metadata', cause);
    return {};
  }
}
97
+ //# sourceMappingURL=extractor.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extractor.js","sourceRoot":"","sources":["../../src/services/extractor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,MAAM,OAAO,CAAC;AAC9B,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AAEnD,OAAO,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AAmBvC,SAAS,cAAc,CAAC,QAAkB,EAAE,SAAmB;IAC7D,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,MAAM,OAAO,GAAG,QAAQ,CAAC,aAAa,CAAC,QAAQ,CAAC,EAAE,YAAY,CAAC,SAAS,CAAC,CAAC;QAC1E,IAAI,OAAO;YAAE,OAAO,OAAO,CAAC;IAC9B,CAAC;IACD,OAAO,SAAS,CAAC;AACnB,CAAC;AAED;;GAEG;AACH,SAAS,2BAA2B,CAAC,QAAkB;IACrD,MAAM,KAAK,GACT,cAAc,CAAC,QAAQ,EAAE;QACvB,2BAA2B;QAC3B,4BAA4B;KAC7B,CAAC,IAAI,QAAQ,CAAC,aAAa,CAAC,OAAO,CAAC,EAAE,WAAW,IAAI,SAAS,CAAC;IAElE,MAAM,WAAW,GAAG,cAAc,CAAC,QAAQ,EAAE;QAC3C,iCAAiC;QACjC,kCAAkC;QAClC,0BAA0B;KAC3B,CAAC,CAAC;IAEH,MAAM,MAAM,GAAG,cAAc,CAAC,QAAQ,EAAE;QACtC,qBAAqB;QACrB,iCAAiC;KAClC,CAAC,CAAC;IAEH,OAAO,EAAE,KAAK,EAAE,WAAW,EAAE,MAAM,EAAE,CAAC;AACxC,CAAC;AAED;;GAEG;AACH,SAAS,0BAA0B,CAAC,QAAkB;IACpD,kDAAkD;IAClD,MAAM,SAAS,GAAG,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAa,CAAC;IACvD,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,SAAS,CAAC,CAAC;IAC1C,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;IAE/B,IAAI,CAAC,OAAO;QAAE,OAAO,IAAI,CAAC;IAE1B,OAAO;QACL,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,SAAS;QACjC,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,SAAS;QACnC,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,EAAE;QAC9B,WAAW,EAAE,OAAO,CAAC,WAAW,IAAI,EAAE;QACtC,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,SAAS;QACrC,QAAQ,EAAE,OAAO,CAAC,QAAQ,IAAI,SAAS;KACxC,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,cAAc,CAAC,IAAY,EAAE,GAAW;IACtD,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC;QACrC,MAAM,QAAQ,GAAG,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC;QAErC,2CAA2C;QAC3C,MAAM,QAAQ,GAAG,2BAA2B,CAAC,QAAQ,CAAC,CAAC;QAEvD,mEAAmE;QACnE,MAAM,OAAO,GAAG,0BAA0B,CAAC,QAAQ,CAAC,CAAC;QAErD,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC;IAC/B,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,QAAQ,CAAC,2BAA2B,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;QAClF,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE,EAAE,CAAC;IACzC,CAAC;AACH,CAAC;AAED;;;GAGG;AA
CH,MAAM,UAAU,cAAc,CAAC,IAAY,EAAE,GAAW;IACtD,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC;QACrC,OAAO,0BAA0B,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;IACzD,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,QAAQ,CAAC,2BAA2B,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;QAClF,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,eAAe,CAAC,IAAY;IAC1C,IAAI,CAAC;QACH,MAAM,EAAE,QAAQ,EAAE,GAAG,IAAI,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC;QAC5C,OAAO,2BAA2B,CAAC,QAAQ,CAAC,CAAC;IAC/C,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,QAAQ,CAAC,4BAA4B,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;QACnF,OAAO,EAAE,CAAC;IACZ,CAAC;AACH,CAAC"}
@@ -0,0 +1,18 @@
1
+ import type { ExtractedArticle } from '../types/index.js';
2
+ declare class ExtractorService {
3
+ /**
4
+ * Extracts main article content using Mozilla Readability; returns null when no article is found or when parsing throws (the failure is logged)
5
+ */
6
+ extractArticle(html: string, url: string): ExtractedArticle | null;
7
+ /**
8
+ * Extracts metadata from HTML <meta> tags and <title>; returns an empty object when parsing throws (the failure is logged)
9
+ */
10
+ extractMetadata(html: string): {
11
+ title?: string; // og:title, then twitter:title, then the <title> text
12
+ description?: string; // og:description, then twitter:description, then meta description
13
+ author?: string; // meta author, then article:author
14
+ };
15
+ }
16
+ export declare const extractorService: ExtractorService; // shared singleton instance
17
+ export {};
18
+ //# sourceMappingURL=extractor.service.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extractor.service.d.ts","sourceRoot":"","sources":["../../src/services/extractor.service.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,mBAAmB,CAAC;AAG1D,cAAM,gBAAgB;IACpB;;OAEG;IACH,cAAc,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,gBAAgB,GAAG,IAAI;IA2BlE;;OAEG;IACH,eAAe,CAAC,IAAI,EAAE,MAAM,GAAG;QAC7B,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,MAAM,CAAC,EAAE,MAAM,CAAC;KACjB;CAgDF;AAGD,eAAO,MAAM,gBAAgB,kBAAyB,CAAC"}
@@ -0,0 +1,75 @@
1
+ import { JSDOM } from 'jsdom';
2
+ import { Readability } from '@mozilla/readability';
3
+ import { logError } from './logger.service.js';
4
/**
 * Article and metadata extraction on top of JSDOM and Mozilla Readability.
 * Every call builds its own JSDOM instance from the given HTML.
 */
class ExtractorService {
  /**
   * Extracts main article content using Mozilla Readability.
   *
   * @param html - HTML source to parse.
   * @param url - URL passed to JSDOM as the document URL.
   * @returns The extracted article, or `null` when Readability returns
   *   nothing or when parsing throws (the error is logged).
   */
  extractArticle(html, url) {
    try {
      const { document } = new JSDOM(html, { url }).window;
      const parsed = new Readability(document).parse();
      if (!parsed) {
        return null;
      }
      const { title, byline, content, textContent, excerpt, siteName } = parsed;
      return {
        title: title ?? undefined,
        byline: byline ?? undefined,
        content: content ?? '',
        textContent: textContent ?? '',
        excerpt: excerpt ?? undefined,
        siteName: siteName ?? undefined,
      };
    } catch (error) {
      const cause = error instanceof Error ? error : undefined;
      logError('Failed to extract article', cause);
      return null;
    }
  }

  /**
   * Extracts metadata from HTML.
   *
   * Each field takes the first truthy value among its sources, in order;
   * the title additionally falls back to the `<title>` text.
   *
   * @param html - HTML source to parse.
   * @returns `{ title, description, author }`, or `{}` when parsing throws
   *   (the error is logged).
   */
  extractMetadata(html) {
    try {
      const { document } = new JSDOM(html).window;
      // `content` attribute of the first element matching the selector, if any.
      const contentOf = (selector) =>
        document.querySelector(selector)?.getAttribute('content');

      const title =
        contentOf('meta[property="og:title"]') ||
        contentOf('meta[name="twitter:title"]') ||
        document.querySelector('title')?.textContent ||
        undefined;

      const description =
        contentOf('meta[property="og:description"]') ||
        contentOf('meta[name="twitter:description"]') ||
        contentOf('meta[name="description"]') ||
        undefined;

      const author =
        contentOf('meta[name="author"]') ||
        contentOf('meta[property="article:author"]') ||
        undefined;

      return { title, description, author };
    } catch (error) {
      const cause = error instanceof Error ? error : undefined;
      logError('Failed to extract metadata', cause);
      return {};
    }
  }
}
73
+ // Singleton instance, created once when this module is first loaded
74
+ export const extractorService = new ExtractorService();
75
+ //# sourceMappingURL=extractor.service.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extractor.service.js","sourceRoot":"","sources":["../../src/services/extractor.service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,MAAM,OAAO,CAAC;AAC9B,OAAO,EAAE,WAAW,EAAE,MAAM,sBAAsB,CAAC;AAEnD,OAAO,EAAE,QAAQ,EAAE,MAAM,qBAAqB,CAAC;AAE/C,MAAM,gBAAgB;IACpB;;OAEG;IACH,cAAc,CAAC,IAAY,EAAE,GAAW;QACtC,IAAI,CAAC;YACH,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,EAAE,EAAE,GAAG,EAAE,CAAC,CAAC;YACrC,MAAM,MAAM,GAAG,IAAI,WAAW,CAAC,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;YACpD,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,EAAE,CAAC;YAE/B,IAAI,CAAC,OAAO,EAAE,CAAC;gBACb,OAAO,IAAI,CAAC;YACd,CAAC;YAED,OAAO;gBACL,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,SAAS;gBACjC,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,SAAS;gBACnC,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,EAAE;gBAC9B,WAAW,EAAE,OAAO,CAAC,WAAW,IAAI,EAAE;gBACtC,OAAO,EAAE,OAAO,CAAC,OAAO,IAAI,SAAS;gBACrC,QAAQ,EAAE,OAAO,CAAC,QAAQ,IAAI,SAAS;aACxC,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,QAAQ,CACN,2BAA2B,EAC3B,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,SAAS,CAC3C,CAAC;YACF,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED;;OAEG;IACH,eAAe,CAAC,IAAY;QAK1B,IAAI,CAAC;YACH,MAAM,GAAG,GAAG,IAAI,KAAK,CAAC,IAAI,CAAC,CAAC;YAC5B,MAAM,QAAQ,GAAG,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC;YAErC,mBAAmB;YACnB,MAAM,KAAK,GACT,QAAQ;iBACL,aAAa,CAAC,2BAA2B,CAAC;gBAC3C,EAAE,YAAY,CAAC,SAAS,CAAC;gBAC3B,QAAQ;qBACL,aAAa,CAAC,4BAA4B,CAAC;oBAC5C,EAAE,YAAY,CAAC,SAAS,CAAC;gBAC3B,QAAQ,CAAC,aAAa,CAAC,OAAO,CAAC,EAAE,WAAW;gBAC5C,SAAS,CAAC;YAEZ,yBAAyB;YACzB,MAAM,WAAW,GACf,QAAQ;iBACL,aAAa,CAAC,iCAAiC,CAAC;gBACjD,EAAE,YAAY,CAAC,SAAS,CAAC;gBAC3B,QAAQ;qBACL,aAAa,CAAC,kCAAkC,CAAC;oBAClD,EAAE,YAAY,CAAC,SAAS,CAAC;gBAC3B,QAAQ;qBACL,aAAa,CAAC,0BAA0B,CAAC;oBAC1C,EAAE,YAAY,CAAC,SAAS,CAAC;gBAC3B,SAAS,CAAC;YAEZ,oBAAoB;YACpB,MAAM,MAAM,GACV,QAAQ;iBACL,aAAa,CAAC,qBAAqB,CAAC;gBACrC,EAAE,YAAY,CAAC,SAAS,CAAC;gBAC3B,QAAQ;qBACL,aAAa,CAAC,iCAAiC,CAAC;oBACjD,EAAE,YAAY,CAAC,SAAS,CAAC;gBAC3B,SAAS,CAAC;YAEZ,OAAO,EAAE,KAAK,EAAE,WAAW,EAAE,MAAM,EAAE,CAAC;QACxC,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,QAAQ,CACN,4BAA4B,EAC5B,KAAK,YAAY,KAAK,CAAC,CAAC,CA
AC,KAAK,CAAC,CAAC,CAAC,SAAS,CAC3C,CAAC;YACF,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;CACF;AAED,qBAAqB;AACrB,MAAM,CAAC,MAAM,gBAAgB,GAAG,IAAI,gBAAgB,EAAE,CAAC"}
@@ -0,0 +1,9 @@
1
+ /**
2
+ * Fetches HTML content from a URL with a single GET request and resolves with the response body as text. customHeaders is optional; in fetcher.js blocked header names (host, authorization, cookie, x-forwarded-for, x-real-ip, proxy-authorization) are dropped before the request is sent. Rejects with TimeoutError on ECONNABORTED/ETIMEDOUT and with FetchError for every other failure.
3
+ */
4
+ export declare function fetchUrl(url: string, customHeaders?: Record<string, string>): Promise<string>;
5
+ /**
6
+ * Fetches URL with exponential backoff retry logic. In fetcher.js maxRetries (default 3) is the total number of attempts made through fetchUrl; HTTP 4xx failures other than 429 are rethrown immediately without retrying, and a FetchError is thrown once every attempt has failed.
7
+ */
8
+ export declare function fetchUrlWithRetry(url: string, customHeaders?: Record<string, string>, maxRetries?: number): Promise<string>;
9
+ //# sourceMappingURL=fetcher.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fetcher.d.ts","sourceRoot":"","sources":["../../src/services/fetcher.ts"],"names":[],"mappings":"AAgDA;;GAEG;AACH,wBAAsB,QAAQ,CAC5B,GAAG,EAAE,MAAM,EACX,aAAa,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GACrC,OAAO,CAAC,MAAM,CAAC,CAyCjB;AAED;;GAEG;AACH,wBAAsB,iBAAiB,CACrC,GAAG,EAAE,MAAM,EACX,aAAa,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,EACtC,UAAU,SAAI,GACb,OAAO,CAAC,MAAM,CAAC,CA4BjB"}
@@ -0,0 +1,100 @@
1
+ import axios from 'axios';
2
+ import { config } from '../config/index.js';
3
+ import { FetchError, TimeoutError } from '../errors/app-error.js';
4
/**
 * Lower-cased names of request headers that callers are not allowed to set.
 */
const BLOCKED_HEADERS = new Set([
    'host',
    'authorization',
    'cookie',
    'x-forwarded-for',
    'x-real-ip',
    'proxy-authorization',
]);
/**
 * Removes blocked headers from a caller-supplied header map.
 *
 * Header names are compared case-insensitively against BLOCKED_HEADERS;
 * the surviving entries keep their original spelling and value.
 *
 * @param headers optional map of header name to header value
 * @returns a new object holding only the permitted headers, or `undefined`
 *          when no headers were given or none of them survived the filter
 */
function sanitizeHeaders(headers) {
    const names = headers ? Object.keys(headers) : [];
    if (names.length === 0) {
        return undefined;
    }
    const allowed = {};
    names
        .filter((name) => !BLOCKED_HEADERS.has(name.toLowerCase()))
        .forEach((name) => {
            allowed[name] = headers[name];
        });
    // An all-blocked input is reported the same way as "no headers at all".
    return Object.keys(allowed).length === 0 ? undefined : allowed;
}
23
/**
 * Computes how long to wait before the next retry: exponential backoff with
 * random jitter.
 *
 * The base delay doubles with every attempt (1000 ms, 2000 ms, 4000 ms, ...)
 * and is capped at `maxDelay`. A random jitter of up to +/-25% of the base is
 * then added so that concurrent callers do not retry in lock-step.
 *
 * @param attempt 1-based number of the attempt that has just failed
 * @param maxDelay upper bound for the returned delay in milliseconds
 *                 (default 10000)
 * @returns delay in whole milliseconds, never greater than `maxDelay`
 */
function calculateBackoff(attempt, maxDelay = 10000) {
    const baseDelay = Math.min(1000 * Math.pow(2, attempt - 1), maxDelay);
    const jitter = baseDelay * 0.25 * (Math.random() * 2 - 1);
    // The jitter is added after the base was capped, so positive jitter could
    // push the result up to 25% above maxDelay; clamp again so the parameter
    // really is a maximum.
    return Math.min(Math.round(baseDelay + jitter), maxDelay);
}
28
/**
 * Shared axios instance used for every outbound request in this module.
 *
 * Timeout, redirect limit, response size limit and User-Agent are read from
 * `config.fetcher`; the remaining default headers are fixed here. Only 2xx
 * statuses are treated as a successful response.
 */
const client = axios.create({
    validateStatus: (status) => {
        return status >= 200 && status < 300;
    },
    headers: {
        'User-Agent': config.fetcher.userAgent,
        Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        Connection: 'keep-alive',
    },
    maxContentLength: config.fetcher.maxContentLength,
    maxRedirects: config.fetcher.maxRedirects,
    timeout: config.fetcher.timeout,
});
41
/**
 * Downloads the body of `url` as text with a single GET request.
 *
 * Caller-supplied headers are filtered through sanitizeHeaders first; the
 * ones that survive are passed as the request's headers. Every failure is
 * translated into one of the project's own error types.
 *
 * @param url address to request
 * @param customHeaders optional extra request headers
 * @returns the response body (`response.data`)
 * @throws TimeoutError when axios reports ECONNABORTED or ETIMEDOUT
 * @throws FetchError for every other failure; it carries the HTTP status
 *         when a response was received
 */
export async function fetchUrl(url, customHeaders) {
    const extraHeaders = sanitizeHeaders(customHeaders);
    const requestConfig = extraHeaders
        ? { method: 'GET', url, responseType: 'text', headers: { ...extraHeaders } }
        : { method: 'GET', url, responseType: 'text' };
    try {
        const { data } = await client.request(requestConfig);
        return data;
    }
    catch (error) {
        throw translateFetchFailure(error, url);
    }
}
/**
 * Maps whatever `client.request` threw onto the error fetchUrl should raise.
 * The checks run from most to least specific: non-axios error, timeout,
 * HTTP response received, request sent without a response, anything else.
 */
function translateFetchFailure(error, url) {
    if (!axios.isAxiosError(error)) {
        const detail = error instanceof Error ? error.message : 'Unknown';
        return new FetchError(`Unexpected error: ${detail}`, url);
    }
    const timedOut = error.code === 'ECONNABORTED' || error.code === 'ETIMEDOUT';
    if (timedOut) {
        return new TimeoutError(config.fetcher.timeout, true);
    }
    const { response } = error;
    if (response) {
        return new FetchError(`HTTP ${response.status}: ${response.statusText}`, url, response.status);
    }
    if (error.request) {
        return new FetchError(`Network error: Could not reach ${url}`, url);
    }
    return new FetchError(error.message, url);
}
74
+ /**
75
+ * Fetches URL with exponential backoff retry logic
76
+ */
77
+ export async function fetchUrlWithRetry(url, customHeaders, maxRetries = 3) {
78
+ let lastError;
79
+ for (let attempt = 1; attempt <= maxRetries; attempt++) {
80
+ try {
81
+ return await fetchUrl(url, customHeaders);
82
+ }
83
+ catch (error) {
84
+ lastError = error instanceof Error ? error : new Error('Unknown error');
85
+ // Don't retry on client errors (4xx) except 429 (rate limited)
86
+ if (error instanceof FetchError && error.httpStatus) {
87
+ const status = error.httpStatus;
88
+ if (status >= 400 && status < 500 && status !== 429) {
89
+ throw error;
90
+ }
91
+ }
92
+ if (attempt < maxRetries) {
93
+ const delay = calculateBackoff(attempt);
94
+ await new Promise((resolve) => setTimeout(resolve, delay));
95
+ }
96
+ }
97
+ }
98
+ throw new FetchError(`Failed after ${maxRetries} attempts: ${lastError?.message ?? 'Unknown error'}`, url);
99
+ }
100
+ //# sourceMappingURL=fetcher.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fetcher.js","sourceRoot":"","sources":["../../src/services/fetcher.ts"],"names":[],"mappings":"AAAA,OAAO,KAA6B,MAAM,OAAO,CAAC;AAClD,OAAO,EAAE,MAAM,EAAE,MAAM,oBAAoB,CAAC;AAC5C,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAElE,MAAM,eAAe,GAAG,IAAI,GAAG,CAAC;IAC9B,MAAM;IACN,eAAe;IACf,QAAQ;IACR,iBAAiB;IACjB,WAAW;IACX,qBAAqB;CACtB,CAAC,CAAC;AAEH,SAAS,eAAe,CACtB,OAAgC;IAEhC,IAAI,CAAC,OAAO,IAAI,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,SAAS,CAAC;IAEpE,MAAM,SAAS,GAA2B,EAAE,CAAC;IAC7C,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC;QACnD,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC,EAAE,CAAC;YAC5C,SAAS,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC;QACzB,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,SAAS,CAAC;AACnE,CAAC;AAED,SAAS,gBAAgB,CAAC,OAAe,EAAE,QAAQ,GAAG,KAAK;IACzD,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,OAAO,GAAG,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC;IACtE,MAAM,MAAM,GAAG,SAAS,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;IAC1D,OAAO,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,MAAM,CAAC,CAAC;AACxC,CAAC;AAED,MAAM,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC;IAC1B,OAAO,EAAE,MAAM,CAAC,OAAO,CAAC,OAAO;IAC/B,YAAY,EAAE,MAAM,CAAC,OAAO,CAAC,YAAY;IACzC,gBAAgB,EAAE,MAAM,CAAC,OAAO,CAAC,gBAAgB;IACjD,OAAO,EAAE;QACP,YAAY,EAAE,MAAM,CAAC,OAAO,CAAC,SAAS;QACtC,MAAM,EAAE,4EAA4E;QACpF,iBAAiB,EAAE,gBAAgB;QACnC,iBAAiB,EAAE,mBAAmB;QACtC,UAAU,EAAE,YAAY;KACzB;IACD,cAAc,EAAE,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,IAAI,GAAG,IAAI,MAAM,GAAG,GAAG;CAC1D,CAAC,CAAC;AAEH;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,QAAQ,CAC5B,GAAW,EACX,aAAsC;IAEtC,MAAM,aAAa,GAAuB;QACxC,MAAM,EAAE,KAAK;QACb,GAAG;QACH,YAAY,EAAE,MAAM;KACrB,CAAC;IAEF,MAAM,SAAS,GAAG,eAAe,CAAC,aAAa,CAAC,CAAC;IACjD,IAAI,SAAS,EAAE,CAAC;QACd,aAAa,CAAC,OAAO,GAAG,EAAE,GAAG,aAAa,CAAC,OAAO,EAAE,GAAG,SAAS,EAAE,CAAC;IACrE,CAAC;IAED,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,OAAO,CAAS,aAAa,CAAC,CAAC;QAC7D,OAAO,QAAQ,CAAC,IA
AI,CAAC;IACvB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,KAAK,CAAC,EAAE,CAAC;YAC/B,MAAM,IAAI,UAAU,CAClB,qBAAqB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,SAAS,EAAE,EACzE,GAAG,CACJ,CAAC;QACJ,CAAC;QAED,IAAI,KAAK,CAAC,IAAI,KAAK,cAAc,IAAI,KAAK,CAAC,IAAI,KAAK,WAAW,EAAE,CAAC;YAChE,MAAM,IAAI,YAAY,CAAC,MAAM,CAAC,OAAO,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC;QACvD,CAAC;QAED,IAAI,KAAK,CAAC,QAAQ,EAAE,CAAC;YACnB,MAAM,IAAI,UAAU,CAClB,QAAQ,KAAK,CAAC,QAAQ,CAAC,MAAM,KAAK,KAAK,CAAC,QAAQ,CAAC,UAAU,EAAE,EAC7D,GAAG,EACH,KAAK,CAAC,QAAQ,CAAC,MAAM,CACtB,CAAC;QACJ,CAAC;QAED,IAAI,KAAK,CAAC,OAAO,EAAE,CAAC;YAClB,MAAM,IAAI,UAAU,CAAC,kCAAkC,GAAG,EAAE,EAAE,GAAG,CAAC,CAAC;QACrE,CAAC;QAED,MAAM,IAAI,UAAU,CAAC,KAAK,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;IAC3C,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,GAAW,EACX,aAAsC,EACtC,UAAU,GAAG,CAAC;IAEd,IAAI,SAA4B,CAAC;IAEjC,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,UAAU,EAAE,OAAO,EAAE,EAAE,CAAC;QACvD,IAAI,CAAC;YACH,OAAO,MAAM,QAAQ,CAAC,GAAG,EAAE,aAAa,CAAC,CAAC;QAC5C,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,SAAS,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC,eAAe,CAAC,CAAC;YAExE,+DAA+D;YAC/D,IAAI,KAAK,YAAY,UAAU,IAAI,KAAK,CAAC,UAAU,EAAE,CAAC;gBACpD,MAAM,MAAM,GAAG,KAAK,CAAC,UAAU,CAAC;gBAChC,IAAI,MAAM,IAAI,GAAG,IAAI,MAAM,GAAG,GAAG,IAAI,MAAM,KAAK,GAAG,EAAE,CAAC;oBACpD,MAAM,KAAK,CAAC;gBACd,CAAC;YACH,CAAC;YAED,IAAI,OAAO,GAAG,UAAU,EAAE,CAAC;gBACzB,MAAM,KAAK,GAAG,gBAAgB,CAAC,OAAO,CAAC,CAAC;gBACxC,MAAM,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC,CAAC;YAC7D,CAAC;QACH,CAAC;IACH,CAAC;IAED,MAAM,IAAI,UAAU,CAClB,gBAAgB,UAAU,cAAc,SAAS,EAAE,OAAO,IAAI,eAAe,EAAE,EAC/E,GAAG,CACJ,CAAC;AACJ,CAAC"}
@@ -0,0 +1,18 @@
1
+ declare class FetcherService {
2
+ private readonly client;
3
+ constructor();
4
+ /**
5
+ * Fetches HTML content from a URL; customHeaders is optional. NOTE(review): the implementation is not part of this declaration file - presumably it mirrors fetchUrl in fetcher.js, confirm against fetcher.service.js.
6
+ * @throws {FetchError} on network or HTTP errors
7
+ * @throws {TimeoutError} on request timeout
8
+ */
9
+ fetchUrl(url: string, customHeaders?: Record<string, string>): Promise<string>;
10
+ /**
11
+ * Fetches URL with exponential backoff retry logic; customHeaders and maxRetries are optional. NOTE(review): the default for maxRetries is not visible in this declaration - confirm against fetcher.service.js.
12
+ * @throws {FetchError} after all retries exhausted
13
+ */
14
+ fetchUrlWithRetry(url: string, customHeaders?: Record<string, string>, maxRetries?: number): Promise<string>;
15
+ }
16
+ export declare const fetcherService: FetcherService;
17
+ export {};
18
+ //# sourceMappingURL=fetcher.service.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fetcher.service.d.ts","sourceRoot":"","sources":["../../src/services/fetcher.service.ts"],"names":[],"mappings":"AA6CA,cAAM,cAAc;IAClB,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAgB;;IAoBvC;;;;OAIG;IACG,QAAQ,CACZ,GAAG,EAAE,MAAM,EACX,aAAa,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GACrC,OAAO,CAAC,MAAM,CAAC;IA8ClB;;;OAGG;IACG,iBAAiB,CACrB,GAAG,EAAE,MAAM,EACX,aAAa,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,EACtC,UAAU,SAAI,GACb,OAAO,CAAC,MAAM,CAAC;CA6BnB;AAGD,eAAO,MAAM,cAAc,gBAAuB,CAAC"}