@maintainabilityai/research-runner 0.1.14 → 0.1.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -40,7 +40,10 @@ async function runHackerNewsSearch(opts) {
|
|
|
40
40
|
fromQuery: query,
|
|
41
41
|
title: r.title,
|
|
42
42
|
url,
|
|
43
|
-
|
|
43
|
+
// Show HN / Ask HN posts carry a self-post body. URL-only
|
|
44
|
+
// submissions have no body — leave empty; the synth agent
|
|
45
|
+
// can read the linked URL if it needs more context.
|
|
46
|
+
content: r.storyText,
|
|
44
47
|
score: pointsToScore(r.points),
|
|
45
48
|
publishedDate: r.createdAt || undefined,
|
|
46
49
|
authors: r.author ? [r.author] : undefined,
|
|
@@ -15,6 +15,8 @@ export interface HackerNewsResult {
|
|
|
15
15
|
points: number;
|
|
16
16
|
numComments: number;
|
|
17
17
|
createdAt: string;
|
|
18
|
+
/** Self-post body for Show HN / Ask HN; empty for URL submissions. */
|
|
19
|
+
storyText: string;
|
|
18
20
|
}
|
|
19
21
|
export interface HackerNewsSearchOpts {
|
|
20
22
|
query: string;
|
|
@@ -39,6 +39,29 @@ async function hackerNewsSearch(opts) {
|
|
|
39
39
|
points: h.points ?? 0,
|
|
40
40
|
numComments: h.num_comments ?? 0,
|
|
41
41
|
createdAt: h.created_at ?? '',
|
|
42
|
+
// Algolia returns the self-post body for Show HN / Ask HN. URL-only
|
|
43
|
+
// submissions have this empty. Strip basic HTML tags so the excerpt
|
|
44
|
+
// is readable in the issue comment.
|
|
45
|
+
storyText: stripBasicHtml(h.story_text ?? '').slice(0, 2000),
|
|
42
46
|
})).filter(r => r.objectId && r.title);
|
|
43
47
|
return { query: opts.query, results, responseBytes: Buffer.byteLength(rawText, 'utf8'), httpStatus };
|
|
44
48
|
}
|
|
49
|
+
/**
|
|
50
|
+
* HN Algolia returns Show HN / Ask HN bodies with light HTML
|
|
51
|
+
* (`<p>`, `<i>`, etc.). Strip tags + decode the common entities so
|
|
52
|
+
* the excerpt blockquote in the issue body reads cleanly. No need for
|
|
53
|
+
* a full HTML parser — these posts are plain text with a sprinkle of
|
|
54
|
+
* inline tags.
|
|
55
|
+
*/
|
|
56
|
+
function stripBasicHtml(s) {
|
|
57
|
+
return s
|
|
58
|
+
.replace(/<\/?(?:p|br|i|b|em|strong|code|pre|a|ul|ol|li)[^>]*>/gi, ' ')
|
|
59
|
+
.replace(/&nbsp;/g, ' ')
|
|
60
|
+
.replace(/&amp;/g, '&')
|
|
61
|
+
.replace(/&lt;/g, '<')
|
|
62
|
+
.replace(/&gt;/g, '>')
|
|
63
|
+
.replace(/&quot;/g, '"')
|
|
64
|
+
.replace(/&#39;/g, "'")
|
|
65
|
+
.replace(/\s+/g, ' ')
|
|
66
|
+
.trim();
|
|
67
|
+
}
|
|
@@ -71,7 +71,10 @@ async function usptoSearch(opts) {
|
|
|
71
71
|
patentNumber: num,
|
|
72
72
|
title: meta.inventionTitle ?? '',
|
|
73
73
|
abstract: '',
|
|
74
|
-
|
|
74
|
+
// earliestPublicationNumber already carries the `US` prefix (e.g.
|
|
75
|
+
// `US20260064729A1`); patentNumber is plain digits. Avoid the double
|
|
76
|
+
// `USUS…` URL we were producing.
|
|
77
|
+
url: num ? `https://patents.google.com/patent/${num.startsWith('US') ? num : `US${num}`}` : '',
|
|
75
78
|
grantedAt: meta.grantDate || meta.filingDate || meta.effectiveFilingDate || '',
|
|
76
79
|
inventors: meta.firstInventorName ? [meta.firstInventorName] : [],
|
|
77
80
|
_xmlUri: xmlUri,
|
|
@@ -80,16 +83,33 @@ async function usptoSearch(opts) {
|
|
|
80
83
|
// Stage 2: parallel best-effort abstract fetch. The full-text XML carries
|
|
81
84
|
// the <abstract> element; we regex it out rather than parsing the whole
|
|
82
85
|
// document (the XML is large and we only want the abstract).
|
|
86
|
+
//
|
|
87
|
+
// Telemetry: count how many we attempted and how many succeeded so the
|
|
88
|
+
// archeologist progress log can surface "uspto abstracts: 3/5" instead of
|
|
89
|
+
// silently shipping empty `>` blockquotes to the synth agent.
|
|
90
|
+
let attempted = 0;
|
|
91
|
+
let succeeded = 0;
|
|
92
|
+
let missingUri = 0;
|
|
93
|
+
const failureCauses = [];
|
|
83
94
|
await Promise.all(stage1.map(async (r) => {
|
|
84
95
|
if (!r._xmlUri) {
|
|
96
|
+
missingUri += 1;
|
|
85
97
|
return;
|
|
86
98
|
}
|
|
99
|
+
attempted += 1;
|
|
87
100
|
try {
|
|
88
101
|
const xmlRes = await fetchImpl(r._xmlUri, {
|
|
89
102
|
method: 'GET',
|
|
90
|
-
|
|
103
|
+
// The signed XML URI lives on a CDN/storage host, not api.uspto.gov.
|
|
104
|
+
// It accepts unauthenticated GETs; sending X-API-Key actually causes
|
|
105
|
+
// some hosts to 403. Use a vanilla request with a polite User-Agent.
|
|
106
|
+
headers: {
|
|
107
|
+
accept: 'application/xml,text/xml,*/*',
|
|
108
|
+
'user-agent': 'maintainabilityai-research-runner/1.0',
|
|
109
|
+
},
|
|
91
110
|
});
|
|
92
111
|
if (!xmlRes.ok) {
|
|
112
|
+
failureCauses.push(`http${xmlRes.status}`);
|
|
93
113
|
return;
|
|
94
114
|
}
|
|
95
115
|
const xml = await xmlRes.text();
|
|
@@ -97,10 +117,20 @@ async function usptoSearch(opts) {
|
|
|
97
117
|
if (m) {
|
|
98
118
|
const stripped = m[1].replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();
|
|
99
119
|
r.abstract = stripped.slice(0, 1000);
|
|
120
|
+
succeeded += 1;
|
|
100
121
|
}
|
|
122
|
+
else {
|
|
123
|
+
failureCauses.push('no-abstract-tag');
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
catch (err) {
|
|
127
|
+
failureCauses.push(err instanceof Error ? err.name : 'unknown');
|
|
101
128
|
}
|
|
102
|
-
catch { /* ignore — best-effort */ }
|
|
103
129
|
}));
|
|
130
|
+
if (records.length > 0 && process.env.RESEARCH_RUNNER_QUIET !== '1') {
|
|
131
|
+
process.stderr.write(`[research-runner] uspto abstracts: ${succeeded}/${attempted} fetched ` +
|
|
132
|
+
`(${missingUri} record(s) had no XML URI; failures: ${failureCauses.join(',') || 'none'})\n`);
|
|
133
|
+
}
|
|
104
134
|
// Drop the internal _xmlUri marker before returning.
|
|
105
135
|
const results = stage1.map(({ _xmlUri: _ignored, ...rest }) => rest);
|
|
106
136
|
return {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@maintainabilityai/research-runner",
|
|
3
|
-
"version": "0.1.14",
|
|
3
|
+
"version": "0.1.16",
|
|
4
4
|
"description": "Research + PRD agent runner — orchestrates the Archeologist and PRD pipelines for the MaintainabilityAI governance mesh",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"author": "MaintainabilityAI",
|