indo-scraper 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,72 @@
1
+ const { fetchHTML, cheerio, parseLdJson, parseThumbnail, parseParagraphs, ok, fail } = require('../utils')
2
+
3
/**
 * Fetch the latest Okezone headlines from a channel index page.
 *
 * @param {object} [options]
 * @param {string} [options.channel='nasional'] - nasional | economy | sports | techno |
 *   celebrity | lifestyle | otomotif | health. Any other value falls back to 'nasional'.
 * @param {number} [options.page=1] - Index page number.
 * @param {number} [options.limit=20] - Maximum number of articles to return.
 * @returns {Promise<object>} Always resolves: ok(articles) on success, fail(reason) otherwise.
 */
const okezone = async (options = {}) => {
  // Each supported channel name is also the okezone.com subdomain that is requested.
  const CHANNELS = ['nasional', 'economy', 'sports', 'techno', 'celebrity', 'lifestyle', 'otomotif', 'health']
  try {
    const { channel = 'nasional', page = 1, limit = 20 } = options
    // Membership test rather than an object lookup, so inherited keys such as
    // 'constructor' can never be mistaken for a channel.
    const sub = CHANNELS.includes(channel) ? channel : 'nasional'
    const url = `https://${sub}.okezone.com/indeks?page=${encodeURIComponent(page)}`
    const html = await fetchHTML(url, { Referer: 'https://www.okezone.com' })
    const $ = cheerio.load(html)
    const articles = []

    // One headline per div.box-text; the thumbnail is looked up in its parent element.
    $('div.box-text').each((_, el) => {
      if (articles.length >= limit) return false // returning false stops .each()
      const $el = $(el)
      const linkEl = $el.find('a.title').first()
      const title = linkEl.text().trim()
      const href = linkEl.attr('href') || ''
      if (!title || !href) return
      const imgEl = $el.parent().find('img').first()
      articles.push({
        title,
        url: href.startsWith('http') ? href : `https://${sub}.okezone.com${href}`,
        image: imgEl.attr('data-src') || imgEl.attr('src') || null,
        // Fall back to the channel that was actually fetched, not the raw request value.
        category: $el.find('a.kanal').text().trim() || sub,
        date: $el.find('div.timego').text().trim() || null,
        source: 'okezone',
      })
    })

    if (!articles.length) return fail('Data tidak ditemukan')
    return ok(articles)
  } catch (e) {
    console.log(e)
    return fail(e)
  }
}
47
+
48
/**
 * Fetch a full Okezone article.
 *
 * Fields are taken from the page's JSON-LD block when present, with DOM or
 * raw-HTML regex fallbacks for individual fields.
 *
 * @param {string} url - Article URL.
 * @returns {Promise<object>} Always resolves: ok({ title, author, date, category,
 *   description, thumbnail, content, paragraphs }) or fail(reason).
 */
const okenewsArticle = async (url) => {
  try {
    const html = await fetchHTML(url, { Referer: 'https://www.okezone.com' })
    const $ = cheerio.load(html)
    const ld = parseLdJson($, html)

    // JSON-LD `author` may be a single entity, a list of entities, or a bare string.
    const ldAuthor = Array.isArray(ld?.author) ? ld.author[0] : ld?.author
    const ldAuthorName = typeof ldAuthor === 'string' ? ldAuthor : ldAuthor?.name

    const paragraphs = parseParagraphs($, '.description.read p')
    return ok({
      title: ld?.headline || $('h1.title-content').first().text().trim(),
      author: ldAuthorName || $('.author, .nm-reporter').first().text().trim(),
      date: ld?.datePublished || html.match(/"datePublished"\s*:\s*"([^"]+)"/)?.[1] || '',
      category: $('.category').first().text().trim() || ld?.articleSection || '',
      description: ld?.description || html.match(/"description"\s*:\s*"([^"]+)"/)?.[1] || '',
      thumbnail: parseThumbnail(ld),
      content: paragraphs.join('\n\n'),
      paragraphs,
    })
  } catch (e) {
    console.log(e)
    return fail(e)
  }
}
71
+
72
+ module.exports = { okezone, okenewsArticle }
@@ -0,0 +1,73 @@
1
+ const { fetchHTML, cheerio, parseLdJson, parseThumbnail, parseParagraphs, ok, fail } = require('../utils')
2
+
3
/**
 * Fetch the latest Republika headlines from a channel listing page.
 *
 * @param {object} [options]
 * @param {string} [options.channel='nasional'] - nasional | internasional | ekonomi |
 *   olahraga | hiburan | tekno | gaya_hidup
 * @param {number} [options.page=1] - Listing page number.
 * @param {number} [options.limit=20] - Maximum number of articles to return.
 * @returns {Promise<object>} Always resolves: ok(articles) on success, fail(reason) otherwise.
 */
const republika = async (options = {}) => {
  try {
    const { channel = 'nasional', page = 1, limit = 20 } = options
    // The underscore form of the option is mapped to the hyphenated path segment.
    const ch = channel === 'gaya_hidup' ? 'gaya-hidup' : channel
    const url = `https://republika.co.id/berita/${ch}?page=${encodeURIComponent(page)}`
    const html = await fetchHTML(url, { Referer: 'https://republika.co.id' })
    const $ = cheerio.load(html)
    const articles = []

    $('.link-mobile-headline').each((_, el) => {
      if (articles.length >= limit) return false // returning false stops .each()
      const $el = $(el)
      const href = $el.attr('href') || ''
      const title = $el.find('h1.card-text, h2.card-text, .card-text').first().text().trim()
      if (!title || !href) return
      const image = $el.find('img.lazy').first().attr('data-original') || null
      const smalls = $el.find('small')
      const category = smalls.filter('.text-primary').first().text().trim() || channel
      // Trim first so the leading "- " separator is removed even when whitespace precedes it.
      const date = smalls.not('.text-primary').first().text().trim().replace(/^-\s*/, '') || null
      articles.push({
        title,
        url: href.startsWith('http') ? href : `https://republika.co.id${href}`,
        image,
        category,
        date,
        source: 'republika',
      })
    })

    if (!articles.length) return fail('Data tidak ditemukan')
    return ok(articles)
  } catch (e) {
    console.log(e)
    return fail(e)
  }
}
39
+
40
/**
 * Fetch a full Republika article.
 *
 * @param {string} url - Article URL.
 * @returns {Promise<object>} Always resolves: ok({ title, author, date, category,
 *   description, thumbnail, content, paragraphs }) or fail(reason).
 */
const republikaArticle = async (url) => {
  try {
    const html = await fetchHTML(url, { Referer: 'https://republika.co.id' })
    const $ = cheerio.load(html)
    const ld = parseLdJson($, html)

    // Derive the category from the URL subdomain
    // (e.g. khazanah.republika.co.id -> 'khazanah').
    let category = ''
    try {
      const sub = new URL(url).hostname.split('.')[0]
      if (sub !== 'republika' && sub !== 'www') category = sub
    } catch (_) {
      // Best effort only: an unparsable URL just means no subdomain category.
    }
    category = category || ld?.articleSection || ''

    // JSON-LD `author` may be a single entity, a list of entities, or a bare string.
    const ldAuthor = Array.isArray(ld?.author) ? ld.author[0] : ld?.author
    const ldAuthorName = typeof ldAuthor === 'string' ? ldAuthor : ldAuthor?.name

    const paragraphs = parseParagraphs($, 'p.paragraphx')
    return ok({
      title: ld?.headline || $('h1').first().text().trim(),
      // Trim before stripping so an "Oleh:" prefix preceded by whitespace is still removed.
      author: ldAuthorName || $('.write-by').first().text().trim().replace(/^Oleh:\s*/i, '').trim(),
      date: ld?.datePublished || html.match(/"datePublished"\s*:\s*"([^"]+)"/)?.[1] || '',
      category,
      description: ld?.description || html.match(/"description"\s*:\s*"([^"]+)"/)?.[1] || '',
      thumbnail: parseThumbnail(ld),
      content: paragraphs.join('\n\n'),
      paragraphs,
    })
  } catch (e) {
    console.log(e)
    return fail(e)
  }
}
72
+
73
+ module.exports = { republika, republikaArticle }
@@ -0,0 +1,95 @@
1
+ const { fetchHTML, cheerio, parseLdJson, parseThumbnail, parseParagraphs, ok, fail } = require('../utils')
2
+
3
/**
 * Pick an absolute image URL from an <img>, trying the lazy-load attributes
 * before plain `src`, then the srcset attributes.
 *
 * @param {Function} $ - Cheerio-style selector function.
 * @param {*} el - Element (or wrapped element) to inspect.
 * @returns {string|null} The first value starting with "http", or null when none is found.
 */
const pickImg = ($, el) => {
  const $img = $(el)
  // Requiring an "http" prefix also rules out inline `data:` placeholders.
  const isHttp = (value) => value.startsWith('http')

  for (const name of ['data-src', 'data-original', 'data-lazy', 'data-image', 'src']) {
    const value = ($img.attr(name) || '').trim()
    if (isHttp(value)) return value
  }

  // Check both srcset variants so an unusable data-srcset does not hide a usable srcset.
  for (const name of ['data-srcset', 'srcset']) {
    const candidate = ($img.attr(name) || '').trim().split(/[\s,]+/).find(isHttp)
    if (candidate) return candidate
  }
  return null
}
14
+
15
/**
 * Fetch the latest Tribunnews headlines from a channel page.
 *
 * @param {object} [options]
 * @param {string} [options.channel='nasional'] - nasional | regional | internasional |
 *   sport | bisnis | seleb | lifestyle | techno | otomotif
 * @param {number} [options.page=1] - Listing page number.
 * @param {number} [options.limit=20] - Maximum number of articles to return.
 * @returns {Promise<object>} Always resolves: ok(articles) on success, fail(reason) otherwise.
 */
const tribun = async (options = {}) => {
  try {
    const { channel = 'nasional', page = 1, limit = 20 } = options
    const url = `https://www.tribunnews.com/${channel}?page=${encodeURIComponent(page)}`
    const html = await fetchHTML(url, { Referer: 'https://www.tribunnews.com' })
    const $ = cheerio.load(html)
    const articles = []

    $("h3 a[href*='tribunnews.com']").each((_, el) => {
      if (articles.length >= limit) return false // returning false stops .each()
      const $el = $(el)
      // A blank `title` attribute must not block the fallback to the link text.
      const title = ($el.attr('title') || '').trim() || $el.text().trim()
      const href = ($el.attr('href') || '').trim()
      if (!title || !href) return

      // Climb to the widest sensible container: li/article first, else the parent of the nearest div.
      let container = $el.closest('li, article')
      if (!container.length) container = $el.closest('div').parent()

      // Use the first image in the container that yields a usable URL.
      let image = null
      container.find('img').each((__, img) => {
        image = pickImg($, img) || null
        if (image) return false
      })

      articles.push({
        title,
        url: href,
        image,
        // .first() keeps a wide fallback container from concatenating several items' text.
        category: container.find('h4 a.tsa-2').first().text().trim() || channel,
        date: container.find('time span').first().text().trim() || null,
        source: 'tribunnews',
      })
    })

    if (!articles.length) return fail('Data tidak ditemukan')
    return ok(articles)
  } catch (e) {
    console.log(e)
    return fail(e)
  }
}
58
+
59
/**
 * Fetch a full Tribunnews article.
 *
 * @param {string} url - Article URL.
 * @returns {Promise<object>} Always resolves: ok({ title, author, date, category,
 *   description, thumbnail, content, paragraphs }) or fail(reason).
 */
const tribunArticle = async (url) => {
  try {
    const html = await fetchHTML(url, { Referer: 'https://www.tribunnews.com' })
    const $ = cheerio.load(html)
    const ld = parseLdJson($, html)

    // JSON-LD `author` may be a single entity, a list of entities, or a bare string;
    // when it gives nothing, fall back to the 'penulis' value embedded in the page source.
    const ldAuthor = Array.isArray(ld?.author) ? ld.author[0] : ld?.author
    const author = (typeof ldAuthor === 'string' ? ldAuthor : ldAuthor?.name)
      || html.match(/'penulis'\s*:\s*'([^']+)'/)?.[1]
      || ''

    // Thumbnail priority: 1) JSON-LD  2) og:image meta tag  3) first image in the article DOM.
    let thumbnail = parseThumbnail(ld)
    if (!thumbnail) thumbnail = $('meta[property="og:image"]').attr('content') || null
    if (!thumbnail) {
      const imgEl = $('.img-holder img, .wrap_img img, figure img, #article-body img, .photo img').first()
      if (imgEl.length) thumbnail = pickImg($, imgEl) || null
    }

    const paragraphs = parseParagraphs($, 'div#article-body p, div.txt-article p')
    return ok({
      title: ld?.headline || $('h1').first().text().trim(),
      author,
      date: ld?.datePublished || html.match(/"datePublished"\s*:\s*"([^"]+)"/)?.[1] || '',
      category: ld?.articleSection || $('.breadcrumb a').eq(1).text().trim(),
      description: ld?.description || html.match(/"description"\s*:\s*"([^"]+)"/)?.[1] || '',
      thumbnail,
      content: paragraphs.join('\n\n'),
      paragraphs,
    })
  } catch (e) {
    console.log(e)
    return fail(e)
  }
}
94
+
95
+ module.exports = { tribun, tribunArticle }
@@ -0,0 +1,69 @@
1
+ const { axios, ok, fail } = require('../utils')
2
+
3
/**
 * SimSimi / AI chat.
 * Source: widipe.com
 *
 * @param {string} text - Message to send.
 * @returns {Promise<object>} Always resolves: ok({ question, answer }) or fail(reason).
 */
const simsimi = async (text) => {
  try {
    if (!text) return fail('Text tidak boleh kosong')

    const res = await axios.get('https://widipe.com/simi', {
      params: { text },
      headers: {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 13; Pixel 7)',
        'Accept': 'application/json',
      },
      timeout: 15000,
    })

    // The reply is read from whichever of these fields is populated; optional
    // chaining keeps an empty or null body from throwing.
    const data = res.data
    const answer = data?.result || data?.message || data?.response
    if (!answer) return fail('Tidak ada respon')

    return ok({ question: text, answer })
  } catch (e) {
    console.log(e)
    const detail = e?.response?.data || e?.response?.status || e?.message || String(e)
    // An object payload would otherwise be reported as "[object Object]".
    return fail(detail !== null && typeof detail === 'object' ? JSON.stringify(detail) : detail)
  }
}
68
+
69
+ module.exports = { simsimi }
@@ -0,0 +1,35 @@
1
+ const { ok, fail } = require('../utils')
2
+
3
// Viewport presets keyed by device name: width and height plus a mobile flag.
const DEVICES = Object.fromEntries(
  [
    ['mobile', 390, 844, true],
    ['tablet', 768, 1024, true],
    ['desktop', 1920, 1080, false],
    ['hd', 2560, 1440, false],
    ['4k', 3840, 2160, false],
  ].map(([name, width, height, mobile]) => [name, { width, height, mobile }]),
)
10
+
11
/**
 * Build a website screenshot URL for the Microlink API.
 *
 * No request is made here; the result carries the URL to fetch.
 *
 * @param {string} url - Page to capture; "https://" is prepended when no http(s) scheme is present.
 * @param {string} [device='desktop'] - mobile | tablet | desktop | hd | 4k (case-insensitive).
 *   Unknown or missing values fall back to 'desktop'.
 * @returns {Promise<object>} Always resolves: ok({ url, device, resolution, screenshot })
 *   or fail(reason). `device` is the preset that was actually applied.
 */
const ssweb = async (url, device = 'desktop') => {
  try {
    const raw = typeof url === 'string' ? url.trim() : url
    if (!raw) return fail('URL tidak boleh kosong')
    // Scheme check is case-insensitive so "HTTP://…" is not double-prefixed.
    const target = /^https?:\/\//i.test(raw) ? String(raw) : `https://${raw}`

    // Own-property check: null/non-string devices and inherited keys such as
    // 'constructor' all resolve to the desktop preset instead of breaking.
    const requested = String(device ?? 'desktop').toLowerCase()
    const profile = Object.prototype.hasOwnProperty.call(DEVICES, requested) ? requested : 'desktop'
    const { width, height, mobile } = DEVICES[profile]
    const resolution = `${width}x${height}`

    const query = new URLSearchParams({
      url: target,
      screenshot: 'true',
      meta: 'false',
      embed: 'screenshot.url',
      viewport: resolution,
      deviceScaleFactor: '1',
      mobile: String(mobile),
      fullPage: 'true',
    })

    return ok({ url: target, device: profile, resolution, screenshot: `https://api.microlink.io/?${query}` })
  } catch (e) {
    console.log(e)
    return fail(e)
  }
}
34
+
35
+ module.exports = { ssweb }
package/src/utils.js ADDED
@@ -0,0 +1,79 @@
1
+ const axios = require('axios')
2
+ const cheerio = require('cheerio')
3
+
4
+ // ── Headers ───────────────────────────────────────────────────────────────────
5
+
6
// Default request headers merged into every fetchHTML() call.
const HEADERS = Object.fromEntries([
  ['User-Agent', 'Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.82 Mobile Safari/537.36'],
  ['Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'],
  ['Accept-Language', 'id-ID,id;q=0.9,en-US;q=0.8'],
  ['Accept-Encoding', 'gzip, deflate, br'],
  ['Sec-Fetch-Dest', 'document'],
  ['Sec-Fetch-Mode', 'navigate'],
  ['Sec-Fetch-Site', 'none'],
  ['Sec-Fetch-User', '?1'],
  ['Upgrade-Insecure-Requests', '1'],
])
17
+
18
/**
 * GET a page as text, with the default HEADERS and up to three attempts.
 *
 * Failed attempts are followed by a growing pause (500 ms, then 1000 ms);
 * there is no pause after the final attempt.
 *
 * @param {string} url - Page to fetch.
 * @param {object} [headers={}] - Extra headers; these override the defaults.
 * @returns {Promise<string>} Response body.
 * @throws {Error} "Gagal fetch <url>: <reason>" once every attempt has failed;
 *   the last underlying error is attached as `cause`.
 */
async function fetchHTML(url, headers = {}) {
  const MAX_ATTEMPTS = 3
  let last
  for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
    try {
      const res = await axios.get(url, { headers: { ...HEADERS, ...headers }, timeout: 10000, responseType: 'text' })
      return res.data
    } catch (e) {
      last = e
      // Back off only when another attempt follows; sleeping before the throw just delays the caller.
      if (attempt < MAX_ATTEMPTS) await new Promise((r) => setTimeout(r, 500 * attempt))
    }
  }
  const error = new Error(`Gagal fetch ${url}: ${last?.message ?? String(last)}`)
  error.cause = last
  throw error
}
31
+
32
/**
 * GET a URL and return the response body as delivered by axios.
 *
 * @param {string} url - Endpoint to request.
 * @param {object} [headers={}] - Extra headers; these override the defaults below.
 * @returns {Promise<*>} `data` of the axios response.
 */
async function fetchJSON(url, headers = {}) {
  // Defaults include a bmkg.go.id Referer/Origin; callers can override any of them.
  const defaultHeaders = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 13) AppleWebKit/537.36 Chrome/124.0 Mobile Safari/537.36',
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'id-ID,id;q=0.9',
    'Referer': 'https://www.bmkg.go.id/',
    'Origin': 'https://www.bmkg.go.id',
  }
  const config = {
    headers: { ...defaultHeaders, ...headers },
    timeout: 10000,
  }
  const { data } = await axios.get(url, config)
  return data
}
46
+
47
/**
 * Find the JSON-LD node that describes the article (the one with `datePublished`).
 *
 * Looks at every <script type="application/ld+json"> block, including nodes inside
 * a top-level array or an `@graph` container. When several nodes qualify, the
 * last one in document order is returned.
 *
 * @param {Function} $ - Cheerio-style selector function for the loaded page.
 * @param {string} html - Raw page HTML (unused; kept for interface compatibility).
 * @returns {object|null} The matching JSON-LD node, or null when none is found.
 */
function parseLdJson($, html) {
  let ld = null
  $('script[type="application/ld+json"]').each((_, el) => {
    let parsed
    try {
      parsed = JSON.parse($(el).html())
    } catch (_err) {
      return // malformed block: skip it and keep scanning
    }
    const roots = Array.isArray(parsed) ? parsed : [parsed]
    for (const root of roots) {
      const graph = root?.['@graph']
      const nodes = Array.isArray(graph) ? [root, ...graph] : [root]
      for (const node of nodes) {
        if (node?.datePublished) ld = node
      }
    }
  })
  return ld
}
54
+
55
/**
 * Extract a thumbnail URL from a JSON-LD node's `image` property.
 *
 * Accepts a plain string, an object with `url` (or `contentUrl`), or an array
 * of either, in which case the first entry is used.
 *
 * @param {object|null|undefined} ld - JSON-LD node.
 * @returns {string|null} Image URL, or null when no string URL can be found.
 */
function parseThumbnail(ld) {
  // Always yields a string or null, so callers never receive a raw image object.
  const resolve = (image) => {
    if (!image) return null
    if (typeof image === 'string') return image
    if (Array.isArray(image)) return resolve(image[0])
    if (typeof image === 'object') return resolve(image.url || image.contentUrl)
    return null
  }
  return resolve(ld?.image)
}
62
+
63
/**
 * Collect article paragraphs, dropping short fragments and boilerplate.
 *
 * @param {Function} $ - Cheerio-style selector function for the loaded page.
 * @param {string} selector - Selector matching the paragraph elements.
 * @returns {string[]} Cleaned paragraph texts in document order.
 */
function parseParagraphs($, selector) {
  const boilerplateStart = /^(baca juga|simak juga|artikel terkait|lihat juga|advertisement|iklan)/i
  const inlineReadAlso = /baca juga\s*:.*?(?=\n|$)/gi

  // Remove inline "baca juga: …" runs (up to the end of the line) and surrounding whitespace.
  const clean = (raw) => raw.trim().replace(inlineReadAlso, '').trim()
  // Keep only texts longer than 30 characters that do not open with a boilerplate phrase.
  const isContent = (text) => text.length > 30 && !boilerplateStart.test(text)

  const cleaned = []
  $(selector).each((_, node) => {
    cleaned.push(clean($(node).text()))
  })
  return cleaned.filter(isContent)
}
73
+
74
+ // ── Responses ─────────────────────────────────────────────────────────────────
75
+
76
// Success envelope: tags the payload with global.creator and status true.
const ok = (data) => {
  const { creator } = global
  return { creator, status: true, data }
}
77
/**
 * Failure envelope: tags a readable message with global.creator and status false.
 *
 * @param {*} e - Error instance, message string, or any other failure reason.
 * @returns {{creator: *, status: boolean, msg: string}}
 */
const fail = (e) => {
  let msg
  if (e instanceof Error) {
    msg = e.message
  } else if (e !== null && typeof e === 'object') {
    // Plain objects (e.g. a JSON error payload) would otherwise become "[object Object]".
    if (typeof e.message === 'string') {
      msg = e.message
    } else {
      try {
        msg = JSON.stringify(e)
      } catch (_) {
        msg = undefined // not serializable (e.g. cyclic): fall back to String() below
      }
      if (typeof msg !== 'string') msg = String(e)
    }
  } else {
    msg = String(e)
  }
  return { creator: global.creator, status: false, msg }
}
78
+
79
+ module.exports = { axios, cheerio, HEADERS, fetchHTML, fetchJSON, parseLdJson, parseThumbnail, parseParagraphs, ok, fail }