@mixpeek/prebid 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,142 @@
1
+ /**
2
+ * Mixpeek Context Adapter - Image Content Extractor
3
+ * @module extractors/imageExtractor
4
+ */
5
+
6
+ import { sanitizeText } from '../utils/helpers.js'
7
+ import logger from '../utils/logger.js'
8
+
9
/**
 * Build plain descriptors for the most prominent images on the page.
 *
 * Candidates are supplied by findPrimaryImages(); only the first `maxImages`
 * of them are converted. Any exception is logged and turned into an empty
 * result.
 * @param {number} maxImages - Maximum number of images to extract
 * @returns {array} Objects of the form { src, alt, title, width, height, aspectRatio }
 */
export function extractImages(maxImages = 5) {
  logger.time('extractImages')

  try {
    // Converts one <img> element into its serialisable descriptor.
    const toDescriptor = (el) => {
      const src = el.src
      const alt = sanitizeText(el.alt || '')
      const title = sanitizeText(el.title || '')
      const width = el.naturalWidth || el.width || 0
      const height = el.naturalHeight || el.height || 0
      return { src, alt, title, width, height, aspectRatio: calculateAspectRatio(el) }
    }

    const selected = findPrimaryImages().slice(0, maxImages)
    const extracted = selected.map((el) => toDescriptor(el))

    logger.timeEnd('extractImages')
    logger.info(`Extracted ${extracted.length} images`)

    return extracted
  } catch (error) {
    logger.error('Error extracting images:', error)
    logger.timeEnd('extractImages')
    return []
  }
}
38
+
39
/**
 * Find primary/hero images on the page.
 *
 * An <img> qualifies when it is at least 200x200 pixels, its computed style is
 * neither `display: none` nor `visibility: hidden`, and neither the image nor
 * any of its ancestors looks like an ad container. The survivors are ordered
 * by pixel area, largest first.
 * @private
 * @returns {array} Array of image elements
 */
function findPrimaryImages() {
  const MIN_SIDE = 200
  // Whole-token class match for "ad" / "advertisement".
  const adClassToken = /(?:^|\s)(?:ad|advertisement)(?:\s|$)/
  // "ad-" only counts at the start of a name or right after a separator.
  // A plain substring test would also hit "head-", "load-", "thread-",
  // "download-", ... and wrongly discard regular content images.
  const adPrefix = /(?:^|[\s_-])ad-/

  const sizeOf = (img) => ({
    width: img.naturalWidth || img.width || 0,
    height: img.naturalHeight || img.height || 0
  })

  const isVisible = (img) => {
    const style = window.getComputedStyle(img)
    return style.display !== 'none' && style.visibility !== 'hidden'
  }

  // Checks the image itself and then every ancestor up to the root.
  const isInsideAd = (img) => {
    for (let node = img; node; node = node.parentElement) {
      const classes = node.getAttribute('class') || ''
      const id = node.getAttribute('id') || ''
      if (adClassToken.test(classes) || adPrefix.test(classes) || adPrefix.test(id)) {
        return true
      }
    }
    return false
  }

  const area = (img) => {
    const { width, height } = sizeOf(img)
    return width * height
  }

  return Array.from(document.querySelectorAll('img'))
    .filter((img) => {
      // Small images are most likely icons or thumbnails
      const { width, height } = sizeOf(img)
      return width >= MIN_SIDE && height >= MIN_SIDE
    })
    .filter((img) => isVisible(img))
    .filter((img) => !isInsideAd(img))
    .sort((a, b) => area(b) - area(a))
}
71
+
72
/**
 * Compute the width/height ratio of an image.
 *
 * Natural dimensions are preferred; the element's `width`/`height` are the
 * fallback. A missing or non-positive height yields 0 instead of a division.
 * @private
 * @param {HTMLImageElement} img - Image element
 * @returns {number} Aspect ratio, or 0 when the height is not positive
 */
function calculateAspectRatio(img) {
  const w = img.naturalWidth || img.width || 0
  const h = img.naturalHeight || img.height || 0
  if (h > 0) {
    return w / h
  }
  return 0
}
83
+
84
/**
 * Extract Open Graph image
 *
 * Reads `og:image` and its optional `og:image:alt`, `og:image:width` and
 * `og:image:height` companions from the document's <meta> tags.
 * Width and height are parsed as base-10 integers; missing, non-numeric or
 * non-positive values become 0. The alt text is passed through sanitizeText,
 * as the other image extractors do.
 * @returns {object|null} OG image data ({ src, alt, width, height }), or null
 *   when the page has no `og:image` tag
 */
export function extractOGImage() {
  const ogImage = document.querySelector('meta[property="og:image"]')
  if (!ogImage) {
    return null
  }

  const metaContent = (property) =>
    document.querySelector(`meta[property="${property}"]`)?.content || ''

  const toDimension = (property) => {
    // Explicit radix, and never let NaN escape into the payload.
    const parsed = Number.parseInt(metaContent(property), 10)
    return Number.isFinite(parsed) && parsed > 0 ? parsed : 0
  }

  return {
    src: ogImage.content,
    alt: sanitizeText(metaContent('og:image:alt')),
    width: toDimension('og:image:width'),
    height: toDimension('og:image:height')
  }
}
100
+
101
/**
 * Extract featured/hero image
 *
 * Tries a list of common featured-image selectors in priority order. The
 * class-based selectors can match a wrapper element (e.g. a <div> or <figure>)
 * rather than the <img> itself; in that case the first <img> inside the
 * wrapper is used. A selector whose match contains no image is skipped so the
 * next selector gets a chance.
 * @returns {object|null} Featured image data ({ src, alt, width, height }),
 *   or null when no selector yields an image
 */
export function extractFeaturedImage() {
  // Check for common featured image patterns
  const selectors = [
    '[class*="featured-image"]',
    '[class*="hero-image"]',
    '[class*="header-image"]',
    'article img:first-of-type',
    '.post-thumbnail img'
  ]

  for (const selector of selectors) {
    const match = document.querySelector(selector)
    if (!match) continue

    // Reading .src/.alt off a wrapper would produce undefined fields.
    const isImage = String(match.tagName || '').toUpperCase() === 'IMG'
    const img = isImage ? match : match.querySelector('img')
    if (!img) continue

    return {
      src: img.src,
      alt: sanitizeText(img.alt || ''),
      width: img.naturalWidth || img.width || 0,
      height: img.naturalHeight || img.height || 0
    }
  }

  return null
}
129
+
130
/**
 * Report whether the page contains at least one sizeable image.
 *
 * "Sizeable" means both dimensions are 200 pixels or more, using the natural
 * size when available and the element's width/height otherwise.
 * @returns {boolean} true as soon as one such <img> is found
 */
export function hasImages() {
  const candidates = Array.from(document.querySelectorAll('img'))
  for (const el of candidates) {
    const w = el.naturalWidth || el.width || 0
    const h = el.naturalHeight || el.height || 0
    if (w >= 200 && h >= 200) {
      return true
    }
  }
  return false
}
142
+
@@ -0,0 +1,196 @@
1
+ /**
2
+ * Mixpeek Context Adapter - Page Content Extractor
3
+ * @module extractors/pageExtractor
4
+ */
5
+
6
+ import { PERFORMANCE } from '../config/constants.js'
7
+ import { sanitizeText, truncateText, extractDomain } from '../utils/helpers.js'
8
+ import logger from '../utils/logger.js'
9
+
10
/**
 * Gather the contextual content of the current page into one object.
 *
 * Combines the URL and its domain, the document title, meta description,
 * body text, meta keywords, Open Graph tags, JSON-LD structured data and the
 * document language (defaulting to 'en' when <html lang> is empty).
 * @returns {object} Extracted page content, or null if any step throws
 */
export function extractPageContent() {
  logger.time('extractPageContent')

  try {
    const href = window.location.href

    // Fields are populated one by one, in the order they appear in the result.
    const content = {}
    content.url = href
    content.domain = extractDomain(href)
    content.title = document.title
    content.description = extractMetaDescription()
    content.text = extractBodyText()
    content.keywords = extractMetaKeywords()
    content.ogTags = extractOpenGraphTags()
    content.structuredData = extractStructuredData()
    content.language = document.documentElement.lang || 'en'

    logger.timeEnd('extractPageContent')

    const summary = {
      url: content.url,
      textLength: content.text.length,
      keywords: content.keywords.length
    }
    logger.info('Extracted page content:', summary)

    return content
  } catch (error) {
    logger.error('Error extracting page content:', error)
    logger.timeEnd('extractPageContent')
    return null
  }
}
44
+
45
/**
 * Read the page description from its <meta> tags.
 *
 * The standard `description` tag wins; `og:description` is the fallback.
 * @private
 * @returns {string} Sanitized description, or '' when neither tag exists
 */
function extractMetaDescription() {
  const sources = ['meta[name="description"]', 'meta[property="og:description"]']

  for (const selector of sources) {
    const tag = document.querySelector(selector)
    if (tag) {
      return sanitizeText(tag.content)
    }
  }

  return ''
}
55
+
56
/**
 * Read the comma-separated `keywords` meta tag.
 *
 * Each entry is trimmed and empty entries are dropped.
 * @private
 * @returns {array} Keywords array; empty when the tag is absent
 */
function extractMetaKeywords() {
  const tag = document.querySelector('meta[name="keywords"]')
  if (!tag) {
    return []
  }

  const keywords = []
  for (const piece of tag.content.split(',')) {
    const keyword = piece.trim()
    if (keyword) {
      keywords.push(keyword)
    }
  }
  return keywords
}
68
+
69
/**
 * Extract body text content
 *
 * Works on a deep clone of <body> so the live page is never modified. Script,
 * style, noscript, iframe, nav, footer, aside and `.ad` / `.advertisement`
 * elements are removed from the clone before its text is read. The text is
 * then passed through sanitizeText and truncateText with
 * PERFORMANCE.MAX_CONTENT_SIZE.
 * @private
 * @returns {string} Body text, or '' when the document has no <body> yet
 */
function extractBodyText() {
  // document.body is null when this runs before <body> has been parsed
  // (e.g. from a script in <head>). Returning '' keeps the caller's other
  // fields usable instead of failing the whole extraction.
  const body = document.body
  if (!body) {
    return ''
  }

  // Remove script, style, and other non-content elements.
  // <noscript> is included because, with scripting enabled, its fallback
  // markup is exposed as plain text and would pollute the extracted content.
  const clone = body.cloneNode(true)
  const elementsToRemove = clone.querySelectorAll(
    'script, style, noscript, iframe, nav, footer, aside, .ad, .advertisement'
  )
  elementsToRemove.forEach(el => el.remove())

  // Get text content
  const text = clone.textContent || clone.innerText || ''

  // Sanitize and truncate
  return truncateText(sanitizeText(text), PERFORMANCE.MAX_CONTENT_SIZE)
}
86
+
87
/**
 * Collect every `og:*` meta tag into a flat object.
 *
 * Keys are the property names with the first 'og:' removed; values are the
 * tags' `content`. A later tag with the same property overwrites an earlier one.
 * @private
 * @returns {object} Open Graph data
 */
function extractOpenGraphTags() {
  const tags = Array.from(document.querySelectorAll('meta[property^="og:"]'))

  return tags.reduce((collected, tag) => {
    const key = tag.getAttribute('property').replace('og:', '')
    collected[key] = tag.content
    return collected
  }, {})
}
103
+
104
/**
 * Parse every JSON-LD script on the page.
 *
 * Scripts whose content is not valid JSON are skipped with a warning; the
 * remaining parsed values are returned in document order.
 * @private
 * @returns {array} Structured data objects
 */
function extractStructuredData() {
  const parsed = []
  const blocks = Array.from(document.querySelectorAll('script[type="application/ld+json"]'))

  for (const block of blocks) {
    try {
      parsed.push(JSON.parse(block.textContent))
    } catch (e) {
      logger.warn('Failed to parse structured data:', e)
    }
  }

  return parsed
}
124
+
125
/**
 * Extract the main article of the page, if one can be located.
 *
 * The article container is the first hit among, in priority order:
 * <article>, [role="article"], .article, .post.
 * @returns {object|null} { headline, author, datePublished, content }, or
 *   null when no container is found or extraction throws
 */
export function extractArticleContent() {
  const containerSelectors = ['article', '[role="article"]', '.article', '.post']

  try {
    let article = null
    for (const selector of containerSelectors) {
      article = document.querySelector(selector)
      if (article) break
    }

    if (!article) {
      return null
    }

    const headline = extractHeadline(article)
    const author = extractAuthor(article)
    const datePublished = extractPublishDate(article)
    const content = sanitizeText(article.textContent || article.innerText || '')

    return { headline, author, datePublished, content }
  } catch (error) {
    logger.warn('Error extracting article content:', error)
    return null
  }
}
149
+
150
/**
 * Read the article headline.
 *
 * The first <h1> inside the article is preferred; an element carrying
 * itemprop="headline" is the fallback.
 * @private
 * @param {Element} article - Article element
 * @returns {string} Sanitized headline text, '' when neither element exists
 */
function extractHeadline(article) {
  const found = ['h1', '[itemprop="headline"]'].map((selector) => article.querySelector(selector))
  const source = found[0] || found[1]
  const raw = source ? source.textContent || '' : ''
  return sanitizeText(raw)
}
161
+
162
/**
 * Read the article's author.
 *
 * Looks, in priority order, for [itemprop="author"], [rel="author"] and
 * .author inside the article and uses the text of the first hit.
 * @private
 * @param {Element} article - Article element
 * @returns {string} Sanitized author text, '' when nothing matches
 */
function extractAuthor(article) {
  const selectors = ['[itemprop="author"]', '[rel="author"]', '.author']

  let node = null
  for (const selector of selectors) {
    node = article.querySelector(selector)
    if (node) break
  }

  const raw = node ? node.textContent || '' : ''
  return sanitizeText(raw)
}
174
+
175
/**
 * Extract publish date
 *
 * Uses the first [itemprop="datePublished"] element in the article, or the
 * first <time datetime> otherwise. The value is taken from, in order:
 * the `datetime` attribute, the `content` attribute (the form used by
 * `<meta itemprop="datePublished" content="...">`), then the element's
 * trimmed text.
 * @private
 * @param {Element} article - Article element
 * @returns {string} Publish date, or '' when none is found
 */
function extractPublishDate(article) {
  const dateEl = article.querySelector('[itemprop="datePublished"]') ||
    article.querySelector('time[datetime]')
  if (!dateEl) {
    return ''
  }

  return dateEl.getAttribute('datetime') ||
    // <meta> microdata carries its value in `content` and has no text.
    dateEl.getAttribute('content') ||
    (dateEl.textContent || '').trim()
}
186
+
187
/**
 * Decide whether the current page looks like an article.
 *
 * True when any of these exists: an <article> element, an element whose
 * itemtype contains "Article", or an og:type meta tag with content "article".
 * @returns {boolean}
 */
export function isArticlePage() {
  const markers = [
    'article',
    '[itemtype*="Article"]',
    'meta[property="og:type"][content="article"]'
  ]
  return markers.some((selector) => document.querySelector(selector) !== null)
}
196
+
@@ -0,0 +1,228 @@
1
+ /**
2
+ * Mixpeek Context Adapter - Video Content Extractor
3
+ * @module extractors/videoExtractor
4
+ */
5
+
6
+ import { sanitizeText } from '../utils/helpers.js'
7
+ import logger from '../utils/logger.js'
8
+
9
/**
 * Describe the main video on the page.
 *
 * findVideoElements(selector) supplies the candidates and the first one is
 * described: source URL, poster, title, description, duration, current time,
 * dimensions and data-attribute metadata.
 * @param {string} selector - CSS selector for video element
 * @returns {object|null} Extracted video content; null when no element
 *   matches or extraction throws
 */
export function extractVideoContent(selector = 'video') {
  const timerLabel = 'extractVideoContent'
  logger.time(timerLabel)

  try {
    const candidates = findVideoElements(selector)

    if (candidates.length === 0) {
      logger.info('No video elements found')
      logger.timeEnd(timerLabel)
      return null
    }

    // The candidate list's first entry is treated as the main video
    const video = candidates[0]

    const content = {}
    content.src = getVideoSource(video)
    content.poster = video.poster || ''
    content.title = extractVideoTitle(video)
    content.description = extractVideoDescription(video)
    content.duration = video.duration || 0
    content.currentTime = video.currentTime || 0
    content.dimensions = {
      width: video.videoWidth || video.width || 0,
      height: video.videoHeight || video.height || 0
    }
    content.metadata = extractVideoMetadata(video)

    logger.timeEnd(timerLabel)

    const { src, title, duration } = content
    logger.info('Extracted video content:', { src, title, duration })

    return content
  } catch (error) {
    logger.error('Error extracting video content:', error)
    logger.timeEnd(timerLabel)
    return null
  }
}
56
+
57
/**
 * Collect the elements matching `selector`, biggest first.
 *
 * Size is the pixel area, taken from videoWidth/videoHeight when set and from
 * the element's width/height otherwise.
 * @private
 * @param {string} selector - CSS selector
 * @returns {array} Array of video elements ordered by descending area
 */
function findVideoElements(selector) {
  const pixelArea = (el) =>
    (el.videoWidth || el.width || 0) * (el.videoHeight || el.height || 0)

  const matches = Array.from(document.querySelectorAll(selector))
  matches.sort((left, right) => {
    const leftArea = pixelArea(left)
    const rightArea = pixelArea(right)
    return rightArea - leftArea
  })
  return matches
}
73
+
74
/**
 * Resolve the URL a video element plays from.
 *
 * Checked in order: the element's own `src`, the `src` of its first <source>
 * child, then `currentSrc`.
 * @private
 * @param {HTMLVideoElement} video - Video element
 * @returns {string} Video source URL, '' when none of the three is set
 */
function getVideoSource(video) {
  const direct = video.src
  if (direct) {
    return direct
  }

  const nested = video.querySelector('source')?.src
  if (nested) {
    return nested
  }

  return video.currentSrc || ''
}
93
+
94
/**
 * Work out a title for a video element.
 *
 * Sources, in priority order:
 *  1. the element's own data-title, title or aria-label attribute;
 *  2. the data-video-title attribute of the closest element that has one;
 *  3. the first h1/h2/h3 inside the previous sibling, else inside the parent.
 * @private
 * @param {HTMLVideoElement} video - Video element
 * @returns {string} Sanitized title, '' when no source provides one
 */
function extractVideoTitle(video) {
  // 1. Attributes on the element itself
  let ownTitle = null
  for (const attribute of ['data-title', 'title', 'aria-label']) {
    ownTitle = video.getAttribute(attribute)
    if (ownTitle) break
  }
  if (ownTitle) {
    return sanitizeText(ownTitle)
  }

  // 2. Title declared on a container
  const titledContainer = video.closest('[data-video-title]')
  if (titledContainer) {
    return sanitizeText(titledContainer.getAttribute('data-video-title'))
  }

  // 3. Nearby heading
  const headingSelector = 'h1, h2, h3'
  const heading =
    video.previousElementSibling?.querySelector(headingSelector) ||
    video.parentElement?.querySelector(headingSelector)

  return heading ? sanitizeText(heading.textContent) : ''
}
123
+
124
/**
 * Work out a description for a video element.
 *
 * The element's own data-description or aria-description attribute wins;
 * otherwise the data-video-description attribute of the closest element
 * carrying one is used.
 * @private
 * @param {HTMLVideoElement} video - Video element
 * @returns {string} Sanitized description, '' when none is available
 */
function extractVideoDescription(video) {
  let own = null
  for (const attribute of ['data-description', 'aria-description']) {
    own = video.getAttribute(attribute)
    if (own) break
  }
  if (own) {
    return sanitizeText(own)
  }

  // Fall back to a description declared on a container
  const describedContainer = video.closest('[data-video-description]')
  return describedContainer
    ? sanitizeText(describedContainer.getAttribute('data-video-description'))
    : ''
}
144
+
145
/**
 * Copy a video element's data-* attributes into a plain object.
 *
 * The leading 'data-' is dropped from each name and remaining hyphens become
 * underscores (data-video-id -> video_id). Values are the raw attribute strings.
 * @private
 * @param {HTMLVideoElement} video - Video element
 * @returns {object} Video metadata
 */
function extractVideoMetadata(video) {
  const prefix = 'data-'
  const metadata = {}

  for (const attr of Array.from(video.attributes)) {
    if (!attr.name.startsWith(prefix)) continue

    const key = attr.name.slice(prefix.length).replace(/-/g, '_')
    metadata[key] = attr.value
  }

  return metadata
}
164
+
165
/**
 * Render the video's current frame to an off-screen canvas and return it as
 * a JPEG data URL.
 *
 * The canvas takes the video's intrinsic size, falling back to the element's
 * width/height and finally to 640x360. Any exception raised along the way is
 * logged as a warning and reported as null.
 * @param {HTMLVideoElement} video - Video element
 * @param {number} quality - JPEG quality (0-1)
 * @returns {string|null} Base64 encoded image, or null on failure
 */
export function captureVideoFrame(video, quality = 0.8) {
  try {
    const canvas = Object.assign(document.createElement('canvas'), {
      width: video.videoWidth || video.width || 640,
      height: video.videoHeight || video.height || 360
    })

    const context = canvas.getContext('2d')
    context.drawImage(video, 0, 0, canvas.width, canvas.height)

    return canvas.toDataURL('image/jpeg', quality)
  } catch (error) {
    logger.warn('Error capturing video frame:', error)
    return null
  }
}
186
+
187
/**
 * Extract video player information (YouTube, Vimeo, etc.)
 *
 * Looks for an embedded player iframe, YouTube first and then Vimeo, and
 * pulls the video id out of the iframe URL. YouTube detection also covers the
 * privacy-enhanced embed host `youtube-nocookie.com`. The YouTube id stops at
 * the first '/', '?', '#' or '&' so trailing path segments, query strings and
 * fragments never end up in the id.
 * @returns {object|null} Video player info ({ platform, videoId, src });
 *   `videoId` is '' when the URL has no recognisable id; null when no
 *   supported player iframe is present
 */
export function extractVideoPlayerInfo() {
  const players = [
    {
      platform: 'youtube',
      selector: 'iframe[src*="youtube.com"], iframe[src*="youtube-nocookie.com"]',
      idPattern: /embed\/([^/?#&]+)/
    },
    {
      platform: 'vimeo',
      selector: 'iframe[src*="vimeo.com"]',
      idPattern: /video\/(\d+)/
    }
  ]

  for (const { platform, selector, idPattern } of players) {
    const frame = document.querySelector(selector)
    if (!frame) continue

    const src = frame.src
    const videoIdMatch = src.match(idPattern)
    return {
      platform,
      videoId: videoIdMatch ? videoIdMatch[1] : '',
      src
    }
  }

  return null
}
218
+
219
/**
 * Check if page has video content
 *
 * True when the document contains a <video> element or an iframe whose src
 * points at YouTube (including the privacy-enhanced `youtube-nocookie.com`
 * embed host) or Vimeo.
 * @returns {boolean}
 */
export function hasVideo() {
  const selectors = [
    'video',
    'iframe[src*="youtube.com"]',
    'iframe[src*="youtube-nocookie.com"]',
    'iframe[src*="vimeo.com"]'
  ]
  // One selector list is enough: any single match answers the question.
  return document.querySelector(selectors.join(', ')) !== null
}
228
+