@miso.ai/server-wordpress 0.6.3-beta.11 → 0.6.3-beta.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,120 @@
1
+ import { createWriteStream } from 'fs';
2
+ import { access, mkdir } from 'fs/promises';
3
+ import { createGzip } from 'zlib';
4
+ import { startOfDate, endOfDate, stream } from '@miso.ai/server-commons';
5
+ import { WordPressClient } from '../src/index.js';
6
+ import { getFirstPostDate, getLastPostDate, getYear, buildForEntities } from './utils.js';
7
+
8
/**
 * Yargs builder for the `download` command.
 *
 * The command registers exactly the options of the entity commands, so the
 * work is delegated to `buildForEntities`.
 *
 * @param {object} yargs - yargs instance handed to the command builder.
 * @returns {object} whatever `buildForEntities` returns for that instance.
 */
function build(yargs) {
  const configured = buildForEntities(yargs);
  return configured;
}
11
+
12
/**
 * Handler of the `download` command: writes all posts into gzipped
 * `.jsonl.gz` files under `destination`, one file per batch of consecutive
 * years.
 *
 * Years are walked from the year of the newest post back to the year of the
 * oldest post and grouped until a group holds at least `batchSize` posts.
 * A small remainder (fewer than 20% of `batchSize`) is folded into the
 * preceding batch so that no tiny trailing file is produced.
 *
 * @param {object} [args] - parsed command-line arguments.
 * @param {string} [args.destination='./data'] - output directory; created when missing.
 * @param {number} [args.batchSize=30000] - minimum number of posts per batch.
 *   All remaining properties are forwarded to `WordPressClient` and to the
 *   `posts.count` / `posts.stream` calls.
 * @returns {Promise<void>} resolves once every batch has been written.
 */
async function run({
  destination = './data',
  batchSize = 30000,
  ...options
} = {}) {
  const client = new WordPressClient(options);

  const [firstPostDate, lastPostDate] = await Promise.all([
    getFirstPostDate(client),
    getLastPostDate(client),
  ]);
  const firstPostYear = getYear(firstPostDate);
  const lastPostYear = getYear(lastPostDate);

  // divide into batches, newest year first
  const batches = [];
  let endYear;
  let sum = 0;
  for (let year = lastPostYear; year >= firstPostYear; year--) {
    if (endYear === undefined) {
      endYear = year;
    }
    const after = startOfDate(year);
    const before = endOfDate(year);
    // counted one year at a time, sequentially, to keep the load on the site low
    const count = await client.posts.count({ ...options, after, before });
    sum += count;
    if (sum >= batchSize) {
      batches.push({ start: year, end: endYear, records: sum });
      endYear = undefined;
      sum = 0;
    }
  }
  if (sum > 0) {
    batches.push({ start: firstPostYear, end: endYear, records: sum });
  }
  if (batches.length === 0) {
    console.log('No posts found.');
    return;
  }

  // Merge the last two batches if the last batch is too small. This needs a
  // preceding batch to merge into, and because batches run from newest to
  // oldest, the merged batch must extend its *start* back to the older years.
  if (batches.length > 1 && batches[batches.length - 1].records < batchSize * 0.2) {
    const last = batches.pop();
    const previous = batches[batches.length - 1];
    previous.start = last.start;
    previous.records += last.records;
  }

  // computed after the merge so that the progress output matches reality
  const batchCount = batches.length;
  console.log(`Divide into ${batchCount} batches:`);
  for (const { start, end, records } of batches) {
    console.log(`- ${start} -> ${end} (${records} records)`);
  }

  // mkdir -p
  try {
    await access(destination);
  } catch (err) {
    if (err.code !== 'ENOENT') {
      throw err;
    }
    await mkdir(destination, { recursive: true });
    console.log(`Created directory ${destination}`);
  }

  // download, one batch after another
  for (const [index, { start, end, records }] of batches.entries()) {
    const after = startOfDate(start);
    const before = endOfDate(end);
    const filename = `${options.transform ? 'miso' : 'wp'}-posts.${start}-${end}.jsonl.gz`;
    const progress = `[${index + 1} / ${batchCount}]`;

    console.log(`${progress} Downloading ${filename}`);

    const startTime = Date.now();
    const sourceStream = await client.posts.stream({ ...options, after, before });

    await stream.pipeline(
      sourceStream,
      stream.stringify(),
      createGzip(),
      createWriteStream(`${destination}/${filename}`),
    );

    const elapsed = Date.now() - startTime;
    console.log(`${progress} Downloaded ${filename} (${records} records in ${formatDuration(elapsed)})`);
  }

  console.log('Done.');
}
100
+
101
/**
 * Formats a duration in milliseconds as `XhYmZs`, `YmZs` or `Zs`, using the
 * largest non-zero unit as the leading one. Fractions of a second are dropped.
 *
 * @param {number} duration - elapsed time in milliseconds.
 * @returns {string} compact human-readable duration.
 */
function formatDuration(duration) {
  const totalSeconds = Math.floor(duration / 1000);
  const totalMinutes = Math.floor(totalSeconds / 60);
  const hours = Math.floor(totalMinutes / 60);

  if (hours > 0) {
    return `${hours}h${totalMinutes % 60}m${totalSeconds % 60}s`;
  }
  return totalMinutes > 0
    ? `${totalMinutes}m${totalSeconds % 60}s`
    : `${totalSeconds}s`;
}
113
+
114
// yargs command module for `download` (alias `down`).
const downloadCommand = {
  command: 'download',
  aliases: ['down'],
  desc: 'Download all posts and save as files.',
  builder: build,
  handler: run,
};

export default downloadCommand;
package/cli/entities.js CHANGED
@@ -2,62 +2,7 @@ import { Transform } from 'stream';
2
2
  import split2 from 'split2';
3
3
  import { stream, parseDuration } from '@miso.ai/server-commons';
4
4
  import { WordPressClient } from '../src/index.js';
5
- import { normalizeOptions, normalizeTransform } from './utils.js';
6
-
7
- export function buildForEntities(yargs) {
8
- // TODO: make them mutually exclusive
9
- return yargs
10
- .option('terms', {
11
- describe: 'Display terms associated with this type of resource',
12
- type: 'boolean',
13
- })
14
- .option('count', {
15
- alias: 'c',
16
- describe: 'Return the total number of records',
17
- type: 'boolean',
18
- })
19
- .option('date', {
20
- alias: 'd',
21
- describe: 'Only include records in this year/month/day',
22
- })
23
- .option('after', {
24
- alias: 'a',
25
- describe: 'Only include records after this time',
26
- })
27
- .option('before', {
28
- alias: 'b',
29
- describe: 'Only include records before this time',
30
- })
31
- .option('update', {
32
- alias: 'u',
33
- describe: 'Only include records modified in given duration (3h, 2d, etc.)',
34
- })
35
- .option('ids', {
36
- alias: 'include',
37
- describe: 'Specify post ids',
38
- })
39
- .option('fields', {
40
- describe: 'Specify which record fields are retrieved',
41
- type: 'array',
42
- coerce: yargs.coerceToArray,
43
- })
44
- .option('resolve', {
45
- alias: 'r',
46
- describe: 'Attach resolved entities (author, catagories) linked with the subjects',
47
- type: 'boolean',
48
- })
49
- .option('transform', {
50
- alias: 't',
51
- describe: 'Apply transform function to the entities',
52
- });
53
- /*
54
- .option('limit', {
55
- alias: 'n',
56
- describe: 'Limit the amount of records',
57
- type: 'number',
58
- })
59
- */
60
- }
5
+ import { normalizeOptions, buildForEntities } from './utils.js';
61
6
 
62
7
  function build(yargs) {
63
8
  return buildForEntities(yargs)
@@ -110,7 +55,7 @@ export async function runGet(client, name, { transform, ...options }) {
110
55
  await stream.pipelineToStdout(
111
56
  await client.entities(name).stream({
112
57
  ...options,
113
- transform: await normalizeTransform(transform),
58
+ transform,
114
59
  }),
115
60
  stream.stringify(),
116
61
  );
@@ -160,7 +105,6 @@ async function buildUpdateStream(client, name, update, {
160
105
  ...options
161
106
  }) {
162
107
  // TODO: move the logic into client itself
163
- transform = await normalizeTransform(transform);
164
108
  const now = Date.now();
165
109
  update = parseDuration(update);
166
110
  const threshold = now - update;
package/cli/index.js CHANGED
@@ -4,6 +4,8 @@ import version from '../src/version.js';
4
4
  import { profile, init } from './profile.js';
5
5
  import taxonomies from './taxonomies.js';
6
6
  import entities from './entities.js';
7
+ import summarize from './summarize.js';
8
+ import download from './download.js';
7
9
 
8
10
  yargs.build(yargs => {
9
11
  yargs
@@ -26,6 +28,8 @@ yargs.build(yargs => {
26
28
  .hide('debug')
27
29
  .command(init)
28
30
  .command(profile)
31
+ .command(summarize)
32
+ .command(download)
29
33
  .command(taxonomies)
30
34
  .command(entities)
31
35
  .version(version);
@@ -0,0 +1,59 @@
1
+ import { startOfDate, endOfDate } from '@miso.ai/server-commons';
2
+ import { WordPressClient } from '../src/index.js';
3
+ import { getFirstPostDate, getLastPostDate, getYear } from './utils.js';
4
+
5
/**
 * Yargs builder for the `summarize` command. The command defines no options
 * of its own, so the yargs instance is handed back untouched.
 *
 * @param {object} yargs - yargs instance handed to the command builder.
 * @returns {object} the same instance.
 */
function build(yargs) {
  const unchanged = yargs;
  return unchanged;
}
8
+
9
/**
 * Handler of the `summarize` command: prints the total number of posts, the
 * dates of the first and last post, and a per-year table of post counts.
 *
 * @param {object} [options] - parsed command-line arguments; forwarded to
 *   `WordPressClient` and to every `posts.count` call.
 * @returns {Promise<void>} resolves once the summary has been printed.
 */
async function run({ ...options } = {}) {
  const client = new WordPressClient(options);
  const [total, firstPostDate, lastPostDate] = await Promise.all([
    client.posts.count(options),
    getFirstPostDate(client),
    getLastPostDate(client),
  ]);

  console.log();
  console.log(`Total posts: ${total}`);
  console.log(`First post at: ${firstPostDate}`);
  console.log(`Last post at: ${lastPostDate}`);

  // drill down by year
  const countHeader = 'Posts';
  // The column must fit both the header and the largest possible count
  // (the total); otherwise the header overflows for totals below 10000.
  const countWidth = Math.max(countHeader.length, `${total}`.length);
  const bar = `| ---- | ${'-'.repeat(countWidth)} |`;

  console.log();
  console.log(bar);
  console.log(`| Year | ${countHeader.padStart(countWidth)} |`);
  console.log(bar);
  const lastYear = getYear(lastPostDate);
  for (let year = getYear(firstPostDate); year <= lastYear; year++) {
    const after = startOfDate(year);
    const before = endOfDate(year);
    const count = await client.posts.count({ ...options, after, before });
    console.log(`| ${year} | ${`${count}`.padStart(countWidth)} |`);
  }
  console.log(bar);
}
36
+
37
/**
 * Prints a two-dimensional array as a left-aligned, space-separated table,
 * one `console.log` call per row.
 *
 * Cells are stringified with `str` and padded with `rightPad` to the width
 * of the widest cell in their column. Rows may have different lengths; an
 * empty table prints nothing.
 *
 * @param {Array<Array<*>>} arr - table rows.
 * @returns {void}
 */
function printTable(arr) {
  if (arr.length === 0) {
    return; // nothing to print, and no first row to measure
  }
  const rows = arr.map((row) => row.map(str));
  // measure every column that appears in any row, not just in the first one
  const columnCount = Math.max(...rows.map((row) => row.length));
  const colWidths = Array.from({ length: columnCount }, (_, i) =>
    Math.max(...rows.map((row) => (row[i] || '').length)));
  for (const row of rows) {
    console.log(row.map((v, i) => rightPad(v, colWidths[i])).join(' '));
  }
}
44
+
45
/**
 * Stringifies a table cell, rendering `undefined` as the placeholder `--`.
 *
 * @param {*} value - cell value.
 * @returns {string} `'--'` for `undefined`, otherwise the template-literal
 *   string form of `value`.
 */
function str(value) {
  if (value === undefined) {
    return '--';
  }
  return `${value}`;
}
48
+
49
/**
 * Pads a string with trailing spaces up to `length` characters.
 *
 * @param {string} [str=''] - text to pad.
 * @param {number} length - target length; shorter targets leave the text as is.
 * @returns {string} the padded text.
 */
function rightPad(str = '', length) {
  const padded = str.padEnd(length);
  return padded;
}
52
+
53
// yargs command module for `summarize` (alias `sum`).
const summarizeCommand = {
  command: 'summarize',
  aliases: ['sum'],
  desc: 'Print out a summary of the WordPress site',
  builder: build,
  handler: run,
};

export default summarizeCommand;
package/cli/utils.js CHANGED
@@ -1,8 +1,5 @@
1
- import { join } from 'path';
2
1
  import { startOfDate, endOfDate } from '@miso.ai/server-commons';
3
2
 
4
- const PWD = process.env.PWD;
5
-
6
3
  export function normalizeOptions({ date, after, before, ids, include, ...options }) {
7
4
  [after, before] = [startOfDate(date || after), endOfDate(date || before)];
8
5
  // TODO: rely on yargs to coerce to array
@@ -10,16 +7,75 @@ export function normalizeOptions({ date, after, before, ids, include, ...options
10
7
  return { ...options, after, before, ids };
11
8
  }
12
9
 
13
- export async function normalizeTransform(transform) {
14
- if (typeof transform === 'string') {
15
- if (transform === 'default' || transform === 'legacy') {
16
- return transform;
17
- }
18
- return (await import(join(PWD, transform))).default;
19
- }
20
- return !!transform;
21
- }
22
-
23
10
  export function parseDate(value) {
24
11
  return Date.parse(`${value}Z`);
25
12
  }
13
+
14
/**
 * Fetches the date of the first post, i.e. the first record when posts are
 * requested in ascending order.
 *
 * @param {object} client - client exposing `posts.getAll`.
 * @param {object} [options] - extra query options passed through to `getPostDate`.
 * @returns {Promise<*>} the value resolved by `getPostDate`.
 */
export async function getFirstPostDate(client, options) {
  const firstDate = await getPostDate(client, 'asc', options);
  return firstDate;
}
17
+
18
/**
 * Fetches the date of the last post, i.e. the first record when posts are
 * requested in descending order.
 *
 * @param {object} client - client exposing `posts.getAll`.
 * @param {object} [options] - extra query options passed through to `getPostDate`.
 * @returns {Promise<*>} the value resolved by `getPostDate`.
 */
export async function getLastPostDate(client, options) {
  const lastDate = await getPostDate(client, 'desc', options);
  return lastDate;
}
21
+
22
/**
 * Fetches the `date_gmt` of the first post returned for the given sort order.
 *
 * Only one record and only the `date_gmt` field are requested; `limit`,
 * `order` and `fields` override any same-named entries in `options`.
 *
 * @param {object} client - client exposing `posts.getAll`.
 * @param {string} order - sort order forwarded to the query (`'asc'` or `'desc'`).
 * @param {object} [options={}] - additional query options.
 * @returns {Promise<*>} the post's `date_gmt`, or `undefined` when the query
 *   yields no post (instead of failing on a missing first record).
 */
async function getPostDate(client, order, options = {}) {
  const posts = await client.posts.getAll({ ...options, limit: 1, order, fields: ['date_gmt'] });
  const [post] = posts || [];
  return post ? post.date_gmt : undefined;
}
25
+
26
/**
 * Extracts the calendar year from a date string.
 *
 * The string is parsed by the `Date` constructor and the year is read with
 * `getFullYear()`, i.e. in the local time zone of the process.
 *
 * @param {string} dateStr - date text accepted by `new Date(...)`.
 * @returns {number} four-digit year, or `NaN` when the text cannot be parsed.
 */
export function getYear(dateStr) {
  const parsed = new Date(dateStr);
  return parsed.getFullYear();
}
29
+
30
/**
 * Registers the command-line options shared by the entity-style commands on
 * a yargs instance.
 *
 * Options are registered in the listed order through chained `.option()`
 * calls; the `fields` option is coerced with `yargs.coerceToArray`.
 *
 * @param {object} yargs - yargs instance providing `option` and `coerceToArray`.
 * @returns {object} the result of the last `.option()` call in the chain.
 */
export function buildForEntities(yargs) {
  // TODO: make them mutually exclusive
  const options = [
    ['terms', {
      describe: 'Display terms associated with this type of resource',
      type: 'boolean',
    }],
    ['count', {
      alias: 'c',
      describe: 'Return the total number of records',
      type: 'boolean',
    }],
    ['date', {
      alias: 'd',
      describe: 'Only include records in this year/month/day',
    }],
    ['after', {
      alias: 'a',
      describe: 'Only include records after this time',
    }],
    ['before', {
      alias: 'b',
      describe: 'Only include records before this time',
    }],
    ['update', {
      alias: 'u',
      describe: 'Only include records modified in given duration (3h, 2d, etc.)',
    }],
    ['ids', {
      alias: 'include',
      describe: 'Specify post ids',
    }],
    ['fields', {
      describe: 'Specify which record fields are retrieved',
      type: 'array',
      coerce: yargs.coerceToArray,
    }],
    ['resolve', {
      alias: 'r',
      // help text previously misspelled "categories"
      describe: 'Attach resolved entities (author, categories) linked with the subjects',
      type: 'boolean',
    }],
    ['transform', {
      alias: 't',
      describe: 'Apply transform function to the entities',
    }],
    ['limit', {
      alias: 'n',
      describe: 'Limit the amount of records',
      type: 'number',
    }],
  ];
  return options.reduce((chain, [name, spec]) => chain.option(name, spec), yargs);
}
package/package.json CHANGED
@@ -17,9 +17,9 @@
17
17
  "simonpai <simon.pai@askmiso.com>"
18
18
  ],
19
19
  "dependencies": {
20
- "@miso.ai/server-commons": "0.6.3-beta.11",
20
+ "@miso.ai/server-commons": "0.6.3-beta.13",
21
21
  "axios": "^1.6.2",
22
22
  "axios-retry": "^3.3.1"
23
23
  },
24
- "version": "0.6.3-beta.11"
24
+ "version": "0.6.3-beta.13"
25
25
  }
package/src/client.js CHANGED
@@ -70,7 +70,7 @@ export default class WordPressClient {
70
70
 
71
71
  }
72
72
 
73
- const SITE_PROFILE_PROPS = ['site', 'utcOffset', 'resources'];
73
+ const SITE_PROFILE_PROPS = ['site', 'utcOffset', 'resources', 'defaults'];
74
74
 
75
75
  class SiteProfile {
76
76
 
@@ -50,7 +50,7 @@ export default class EntityIndex {
50
50
  if (this.hierarchical) {
51
51
  return; // already all fetched
52
52
  }
53
- ids = asArray(ids);
53
+ ids = asArray(ids).filter(id => id); // discard 0, null, undefined
54
54
 
55
55
  const promises = []
56
56
  const idsToFetch = [];
@@ -85,8 +85,11 @@ export default class EntityIndex {
85
85
  }
86
86
 
87
87
  _resolveFetch(id) {
88
- this._fetching.get(id).resolve();
89
- this._fetching.delete(id);
88
+ const res = this._fetching.get(id);
89
+ if (res) {
90
+ res.resolve();
91
+ this._fetching.delete(id);
92
+ }
90
93
  }
91
94
 
92
95
  async get(id) {
@@ -96,13 +99,14 @@ export default class EntityIndex {
96
99
  }
97
100
 
98
101
  async getAll(ids) {
102
+ ids = ids.filter(id => id); // discard 0, null, undefined
99
103
  await this._dataReady();
100
104
  await this.fetch(ids);
101
105
  return ids.map(id => this._index.get(id));
102
106
  }
103
107
 
104
108
  async getValue(id) {
105
- if (id === undefined) {
109
+ if (!id) { // 0, null, undefined
106
110
  return undefined;
107
111
  }
108
112
  return this._value(await this.get(id));
@@ -1,3 +1,4 @@
1
+ import { join } from 'path';
1
2
  import { Transform } from 'stream';
2
3
  import { asArray, stream } from '@miso.ai/server-commons';
3
4
  import EntityIndex from './entity-index.js';
@@ -20,9 +21,8 @@ export default class Entities {
20
21
  if (!resolve && !transform) {
21
22
  return this._client._helpers.stream(this.name, options);
22
23
  }
23
- transform = getTransformFn(transform);
24
-
25
24
  const client = this._client;
25
+ transform = await getTransformFn(client, this.name, transform);
26
26
 
27
27
  // we need taxonomy fetched so we know whether it's hierarchical
28
28
  const taxonomies = await client._helpers.findAssociatedTaxonomies(this.name);
@@ -115,8 +115,26 @@ function aggregateIds(records, propName) {
115
115
  }, new Set()));
116
116
  }
117
117
 
118
- function getTransformFn(transform) {
119
- return typeof transform === 'function' ? post => transform(post, { defaultTransform }) :
120
- (transform === true || transform === 'default') ? defaultTransform :
121
- transform === 'legacy' ? legacyTransform : undefined;
118
/**
 * Resolves the `transform` option into the function applied to each record.
 *
 * Resolution order:
 * 1. `true` is replaced by the profile default `defaults.transform[name]`,
 *    or by `'default'` when the profile defines none.
 * 2. `'default'` and `'legacy'` map to the built-in transforms. This also
 *    holds for a profile default, which previously was treated as a file path.
 * 3. Any other string is a module path relative to the working directory;
 *    its default export is used.
 * 4. A function is wrapped so that it receives `{ defaultTransform }` as its
 *    second argument.
 *
 * @param {object} client - client whose optional `_profile.defaults` is consulted.
 * @param {string} name - resource name used to look up the profile default.
 * @param {boolean|string|Function} [transform] - the requested transform.
 * @returns {Promise<Function|undefined>} the transform function, or
 *   `undefined` when no transform applies.
 */
async function getTransformFn(client, name, transform) {
  let requested = transform;
  if (requested === true) {
    const { defaults } = client._profile || {};
    const profileDefault = defaults && defaults.transform && defaults.transform[name];
    requested = profileDefault || 'default';
  }
  switch (requested) {
    case 'default':
      return defaultTransform;
    case 'legacy':
      return legacyTransform;
  }
  if (typeof requested === 'string') {
    // try as file path; PWD is not set on every platform, so fall back to cwd
    const baseDir = process.env.PWD || process.cwd();
    requested = (await import(join(baseDir, requested))).default;
  }
  if (typeof requested === 'function') {
    const customTransform = requested;
    return post => customTransform(post, { defaultTransform });
  }
  return undefined;
}
package/src/helpers.js CHANGED
@@ -6,7 +6,7 @@ import version from './version.js';
6
6
 
7
7
  const MS_PER_HOUR = 1000 * 60 * 60;
8
8
 
9
- const STREAM_OPTIONS = ['offset', 'limit', 'strategy', 'filter', 'transform', 'onLoad'];
9
+ const STREAM_OPTIONS = ['offset', 'strategy', 'filter', 'transform', 'onLoad'];
10
10
 
11
11
  function createAxios(client) {
12
12
  const { auth } = client._options || {};
@@ -10,8 +10,12 @@ export default class Posts extends Entities {
10
10
  super(client, RESOURCE_NAME);
11
11
  }
12
12
 
13
- async getAll() {
14
- throw new Error(`Getting all posts is not supported.`);
13
+ async getAll(options = {}) {
14
+ if (!options.ids && !options.limit) {
15
+ // TODO: should be more tolerant
16
+ throw new Error(`Getting all posts is not supported.`);
17
+ }
18
+ return super.getAll(options);
15
19
  }
16
20
 
17
21
  async index() {
@@ -30,10 +30,10 @@ export default class WordPressDataSource {
30
30
  this._debug(`[WordPressDataSource] request ${url}`);
31
31
  const response = await this._axiosGet(url);
32
32
  this._debug(`[WordPressDataSource] response ${response.status} ${url}`);
33
- return this._process(response, { url });
33
+ return this._process(response, { request, url });
34
34
  }
35
35
 
36
- _process({ status, data }, { url }) {
36
+ _process({ status, data }, { request, url }) {
37
37
  if (status >= 400 && status < 500 && data.code === 'rest_post_invalid_page_number') {
38
38
  // out of bound, so there is no more data
39
39
  return { data: [], terminate: true };
@@ -41,6 +41,10 @@ export default class WordPressDataSource {
41
41
  if (!Array.isArray(data)) {
42
42
  throw new Error(`Unexpected response from WordPress API for ${url}. Expected an array of objects: ${data}`);
43
43
  }
44
+ const { records } = request;
45
+ if (records) {
46
+ data = data.slice(0, records);
47
+ }
44
48
  if (!this._options.preserveLinks) {
45
49
  data = data.map(this._helpers.removeLinks);
46
50
  }
@@ -14,7 +14,7 @@ export default class PagedWordPressDataSource extends WordPressDataSource {
14
14
  if (pageSize > MAX_PAGE_SIZE) {
15
15
  throw new Error(`Page size cannot be greater than ${MAX_PAGE_SIZE}: ${pageSize}`);
16
16
  }
17
- // TODO: limit
17
+ this._limit = limit;
18
18
  this._pageSize = options.pageSize = pageSize;
19
19
  this._page = 0;
20
20
  }
@@ -25,10 +25,13 @@ export default class PagedWordPressDataSource extends WordPressDataSource {
25
25
 
26
26
  request() {
27
27
  const page = this._page++;
28
- const records = this._pageSize;
29
- const total = this._totalValue;
28
+ let records = this._pageSize;
29
+ const limit = combineLimit(this._totalValue, this._limit);
30
30
  // if we know total, we know when the data is exhausted
31
- const exhaust = total !== undefined && ((page + 1) * this._pageSize > total + 10); // 10 for a buffer
31
+ const exhaust = limit !== undefined && ((page + 1) * this._pageSize > limit);
32
+ if (exhaust && this._limit !== undefined) {
33
+ records = this._limit - (page * this._pageSize);
34
+ }
32
35
  return exhaust ? { records, page, exhaust } : { records, page };
33
36
  }
34
37
 
@@ -36,9 +39,18 @@ export default class PagedWordPressDataSource extends WordPressDataSource {
36
39
  return this._totalPromise || (this._totalPromise = this._fetchTotal());
37
40
  }
38
41
 
39
- async _url(baseUrl, { page }) {
42
+ async _url(baseUrl, { records, page }) {
40
43
  const head = baseUrl.indexOf('?') < 0 ? '?' : '&';
41
- return `${baseUrl}${head}page=${page + 1}`;
44
+ let url = `${baseUrl}${head}page=${page + 1}`;
45
+ // optimize: if limit < page size we can save much bandwidth
46
+ if (page === 0 && records < this._pageSize) {
47
+ if (url.indexOf('per_page=') > -1) {
48
+ url = url.replace(/per_page=\d+/, `per_page=${records}`);
49
+ } else {
50
+ url += `&per_page=${records}`;
51
+ }
52
+ }
53
+ return url;
42
54
  }
43
55
 
44
56
  async _fetchTotal() {
@@ -63,3 +75,9 @@ export default class PagedWordPressDataSource extends WordPressDataSource {
63
75
  }
64
76
 
65
77
  }
78
+
79
// Slack added on top of a known total before it is used as a bound.
const TOTAL_BUFFER = 10;

/**
 * Combines the known total with a user-supplied limit into one bound.
 *
 * @param {number} [total] - known number of records, if any.
 * @param {number} [limit] - user-supplied limit, if any.
 * @returns {number|undefined} `limit` when the total is unknown; otherwise
 *   the total plus `TOTAL_BUFFER`, capped by `limit` when one is given.
 */
function combineLimit(total, limit) {
  if (total === undefined) {
    return limit;
  }
  const bufferedTotal = total + TOTAL_BUFFER;
  if (limit === undefined) {
    return bufferedTotal;
  }
  return Math.min(bufferedTotal, limit);
}
package/src/version.js CHANGED
@@ -1 +1 @@
1
- export default '0.6.3-beta.11';
1
+ export default '0.6.3-beta.13';