coelacanth 0.4.3 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,552 @@
+ # frozen_string_literal: true
+
+ require "set"
+ require "strscan"
+
+ module Coelacanth
+   class Extractor
+     # Scores candidate morphemes extracted from article content. The
+     # implementation follows a light-weight heuristic approach that approximates
+     # the specification shared in the user instructions. It prioritises noun-ish
+     # phrases for both Japanese and English text, applies positional boosts, and
+     # returns the highest scoring terms.
+     class MorphologicalAnalyzer
+       TOKEN_PATTERN = /
+         \p{Han}+ | # Kanji sequences
+         \p{Hiragana}+ | # Hiragana sequences
+         [\p{Katakana}ー]+ | # Katakana sequences including the choonpu
+         [A-Za-z0-9]+(?:-[A-Za-z0-9]+)* # Latin alphanumerics keeping inner hyphen
+       /x.freeze
+
+       MARKDOWN_CONTROL_PATTERN = /[`*_>#\[\]\(\)\{\}!\+=|~]/.freeze
+
+       EN_STOPWORDS = %w[
+         a an and are as at be but by for if in into is it its of on or such
+         that the their then there these they this to was were will with
+       ].freeze
+
+       EN_GENERIC_TERMS = %w[
+         page pages article articles category categories tag tags image images
+         video videos click home link links read more author authors post posts
+       ].freeze
+
+       JA_GENERIC_TERMS = %w[カテゴリ カテゴリー 記事 画像 写真 まとめ サイト 投稿 ブログ 最新 人気 関連].freeze
+
+       FULLWIDTH_ALPHA = "ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ".freeze
+       HALF_WIDTH_ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz".freeze
+       FULLWIDTH_DIGITS = "０１２３４５６７８９".freeze
+       HALF_WIDTH_DIGITS = "0123456789".freeze
+       FULLWIDTH_HYPHENS = "-―ーー".freeze
+
+       TOP_K = 8
+       MAX_POSITION_BOOST = 3.0
+       LENGTH_BONUS_FACTOR = 0.15
+       MAX_LENGTH_BONUS = 1.6
+
+       POSITION_WEIGHTS = {
+         body: 1.0,
+         title: 2.2,
+         h1: 1.6,
+         h2: 1.3,
+         accent: 1.1
+       }.freeze
+
+       CATEGORY_ALIASES = {
+         "kanji" => :kanji,
+         "hiragana" => :hiragana,
+         "katakana" => :katakana,
+         "latin" => :latin
+       }.freeze
+
+       Term = Struct.new(:key, :token, :components, :language, keyword_init: true)
+
+       def initialize(config: Coelacanth.config)
+         @config = config
+       end
+
+       def call(node:, title:, markdown:)
+         stats = Hash.new do |hash, key|
+           hash[key] = {
+             token: nil,
+             components: 1,
+             body_count: 0,
+             pos_bonus: 0.0,
+             language: nil
+           }
+         end
+
+         body_terms = extract_terms(markdown)
+
+         contexts = [
+           [POSITION_WEIGHTS[:title], extract_terms(title)],
+           [POSITION_WEIGHTS[:h1], extract_terms(text_for(node, "h1"))],
+           [POSITION_WEIGHTS[:h2], extract_terms(text_for(node, "h2"))],
+           [
+             POSITION_WEIGHTS[:accent],
+             extract_terms(
+               [
+                 text_for(node, "a"),
+                 text_for(node, "strong"),
+                 text_for(node, "b"),
+                 text_for(node, "img", attribute: "alt")
+               ].compact.join(" ")
+             )
+           ],
+           [POSITION_WEIGHTS[:body], body_terms]
+         ]
+
+         contexts.each do |weight, terms|
+           next if terms.empty?
+
+           grouped = terms.group_by(&:key)
+           grouped.each_value do |occurrences|
+             representative = occurrences.max_by(&:components)
+             entry = stats[representative.key]
+             entry[:token] ||= representative.token
+             entry[:components] = [entry[:components], representative.components].max
+             entry[:language] ||= representative.language
+
+             bonus = weight - 1.0
+             entry[:pos_bonus] += bonus if bonus.positive?
+           end
+         end
+
+         body_terms.each do |term|
+           entry = stats[term.key]
+           entry[:token] ||= term.token
+           entry[:components] = [entry[:components], term.components].max
+           entry[:language] ||= term.language
+           entry[:body_count] += 1
+         end
+
+         scored = stats.values.map do |entry|
+           next if entry[:body_count].zero?
+
+           tf = Math.log(entry[:body_count] + 1.0)
+           pos_boost = [1.0 + entry[:pos_bonus], MAX_POSITION_BOOST].min
+           len_bonus = [1.0 + LENGTH_BONUS_FACTOR * (entry[:components] - 1), MAX_LENGTH_BONUS].min
+           score = tf * pos_boost * len_bonus
+
+           entry.merge(score: score)
+         end.compact
+
+         return [] if scored.empty?
+
+         sorted = scored.sort_by { |entry| [-entry[:score], entry[:token]] }
+         pruned = prune_inclusions(sorted)
+         max_score = pruned.first[:score]
+         threshold = max_score * 0.35
+
+         selected = pruned.select { |entry| entry[:score] >= threshold }
+
+         if selected.length < TOP_K
+           pruned.each do |entry|
+             next if selected.include?(entry)
+
+             selected << entry
+             break if selected.length >= TOP_K
+           end
+         end
+
+         selected = selected.take(TOP_K)
+
+         selected.map do |entry|
+           {
+             token: entry[:token],
+             score: entry[:score],
+             count: entry[:body_count]
+           }
+         end
+       end
+
+       private
+
+       def extract_terms(text)
+         sanitized = sanitize_text(text)
+         return [] if sanitized.empty?
+
+         tokens = tokenize(sanitized)
+         build_terms(tokens)
+       end
+
+       def sanitize_text(text)
+         sanitized = text.to_s
+         return "" if sanitized.empty?
+
+         sanitized = sanitized.gsub(MARKDOWN_CONTROL_PATTERN, " ")
+         sanitized.gsub(/^[ \t]*[-+*]\s+/, " ")
+       end
+
+       def tokenize(text)
+         scanner = StringScanner.new(text)
+         tokens = []
+         gap_buffer = String.new
+         until scanner.eos?
+           if (whitespace = scanner.scan(/\s+/))
+             gap_buffer << whitespace
+             next
+           elsif (raw = scanner.scan(TOKEN_PATTERN))
+             end_pos = scanner.pos
+             start_pos = end_pos - raw.length
+             category = detect_category(raw)
+             normalized = normalize_token(raw, category)
+             tokens << {
+               raw: raw,
+               normalized: normalized,
+               category: category,
+               start: start_pos,
+               end: end_pos,
+               gap: gap_buffer
+             }
+             gap_buffer = String.new
+           else
+             gap_buffer << scanner.getch
+           end
+         end
+
+         tokens
+       end
+
+       def build_terms(tokens)
+         terms = []
+         index = 0
+
+         while index < tokens.length
+           token = tokens[index]
+
+           case token[:category]
+           when :latin
+             sequences, index = consume_latin_sequences(tokens, index)
+             sequences.each do |sequence|
+               joined = join_latin_sequence(sequence)
+               normalized = joined[:normalized]
+               components = sequence.length
+
+               next unless valid_english_term?(normalized)
+
+               terms << Term.new(key: normalized, token: normalized, components: components, language: :en)
+             end
+           when :kanji, :katakana
+             sequence, index = consume_japanese_sequence(tokens, index)
+             next if sequence.empty?
+
+             normalized = sequence.map { |component| component[:normalized] }.join
+             components = sequence.length
+
+             next unless valid_japanese_term?(normalized)
+
+             terms << Term.new(key: normalized, token: normalized, components: components, language: :ja)
+           else
+             index += 1
+           end
+         end
+
+         terms
+       end
+
+       def consume_latin_sequences(tokens, index)
+         run = []
+         pointer = index
+
+         while pointer < tokens.length
+           token = tokens[pointer]
+           break unless token[:category] == :latin
+
+           run << token
+           pointer += 1
+
+           break unless pointer < tokens.length
+
+           next_token = tokens[pointer]
+           break unless next_token[:category] == :latin && joinable_gap?(next_token[:gap], :latin)
+         end
+
+         sequences = split_latin_run(run)
+         [sequences, pointer]
+       end
+
+       def split_latin_run(run)
+         sequences = []
+         current = []
+
+         run.each do |token|
+           if english_stopword?(token[:normalized])
+             if current.any?
+               sequences << current
+               current = []
+             end
+           else
+             current << token
+           end
+         end
+
+         sequences << current if current.any?
+         sequences
+       end
+
+       def consume_japanese_sequence(tokens, index)
+         sequence = []
+         pointer = index
+
+         while pointer < tokens.length
+           token = tokens[pointer]
+           unless japanese_noun_token?(token) || (sequence.any? && hiragana_suffix?(sequence.last, token))
+             break
+           end
+
+           sequence << token
+           pointer += 1
+
+           break unless pointer < tokens.length
+
+           next_token = tokens[pointer]
+           break unless japanese_sequence_continues?(token, next_token)
+         end
+
+         [sequence, pointer]
+       end
+
+       def japanese_sequence_continues?(current, following)
+         return false if japanese_category_break?(current, following)
+
+         return false unless japanese_noun_token?(following) ||
+                             (following[:category] == :latin && joinable_gap?(following[:gap], :latin)) ||
+                             hiragana_suffix?(current, following)
+
+         gap = following[:gap]
+         return true if gap.empty?
+
+         gap.strip.empty?
+       end
+
+       def japanese_noun_token?(token)
+         [:kanji, :katakana].include?(token[:category])
+       end
+
+       def hiragana_suffix?(current, following)
+         return false unless current[:category] == :kanji
+         return false unless following[:category] == :hiragana
+         return false unless following[:gap].empty?
+
+         suffixes = configured_hiragana_suffixes
+         return true if suffixes.nil?
+
+         suffixes.include?(following[:raw])
+       end
+
+       def whitespace_gap?(gap)
+         gap.strip.empty?
+       end
+
+       def joinable_gap?(gap, category)
+         return true if whitespace_gap?(gap)
+
+         case category
+         when :latin
+           connector_gap?(gap)
+         else
+           false
+         end
+       end
+
+       def connector_gap?(gap)
+         return false if gap.nil?
+
+         stripped = gap.delete("\s")
+         return false if stripped.empty?
+
+         stripped.chars.all? { |char| latin_joiners.include?(char) }
+       end
+
+       def normalize_token(token, category)
+         normalized = token
+           .tr(FULLWIDTH_ALPHA, HALF_WIDTH_ALPHA)
+           .tr(FULLWIDTH_DIGITS, HALF_WIDTH_DIGITS)
+           .tr(FULLWIDTH_HYPHENS, "-")
+           .downcase
+
+         normalized = lemmatize_latin(normalized) if category == :latin
+         normalized
+       end
+
+       def detect_category(token)
+         return :kanji if token.match?(/\A\p{Han}+\z/)
+         return :hiragana if token.match?(/\A\p{Hiragana}+\z/)
+         return :katakana if token.match?(/\A[\p{Katakana}ー]+\z/)
+
+         :latin
+       end
+
+       def lemmatize_latin(token)
+         return token if token.length <= 3
+         return token if token.include?("-")
+         return token if token.match?(/\d/)
+         return token if token.end_with?("ss") || token.end_with?("us") || token.end_with?("is")
+
+         if token.end_with?("ies") && token.length > 3
+           token[0...-3] + "y"
+         elsif token.end_with?("es") && !token.end_with?("ses") && token.length > 3
+           token[0...-2]
+         elsif token.end_with?("s")
+           token[0...-1]
+         else
+           token
+         end
+       end
+
+       def valid_english_term?(normalized)
+         return false if normalized.empty?
+         return false if normalized.match?(/\A\d+\z/)
+
+         words = normalized.split(/\s+/)
+         return false if words.length == 1 && words.first.length <= 2
+
+         return false if EN_GENERIC_TERMS.include?(normalized)
+
+         true
+       end
+
+       def english_stopword?(word)
+         EN_STOPWORDS.include?(word)
+       end
+
+       def valid_japanese_term?(normalized)
+         return false if normalized.empty?
+         return false if normalized.length == 1
+         return false if normalized.match?(/\A[ぁ-ゖゝゞー]+\z/)
+         return false if normalized.match?(/\A\d+\z/)
+         return false if JA_GENERIC_TERMS.include?(normalized)
+
+         true
+       end
+
+       def prune_inclusions(entries)
+         accepted = []
+
+         entries.each do |entry|
+           next if accepted.any? { |chosen| contains_term?(chosen, entry) }
+
+           accepted << entry
+         end
+
+         accepted
+       end
+
+       def contains_term?(long_entry, short_entry)
+         return false if long_entry.equal?(short_entry)
+         return false if long_entry[:language] != short_entry[:language]
+         return false if long_entry[:token] == short_entry[:token]
+
+         case long_entry[:language]
+         when :en
+           long_words = long_entry[:token].split(/\s+/)
+           short_words = short_entry[:token].split(/\s+/)
+           return false if short_words.length > long_words.length
+
+           long_words.each_cons(short_words.length) do |slice|
+             return true if slice == short_words
+           end
+
+           false
+         when :ja
+           long_entry[:token].include?(short_entry[:token])
+         else
+           false
+         end
+       end
+
+       def text_for(node, selector, attribute: nil)
+         return "" unless node
+
+         elements = node.css(selector)
+         return "" if elements.empty?
+
+         texts = elements.map do |element|
+           if attribute
+             element[attribute].to_s
+           else
+             element.text
+           end
+         end
+
+         texts.join(" ")
+       end
+
+       def join_latin_sequence(sequence)
+         token_builder = String.new
+         normalized_builder = String.new
+
+         sequence.each_with_index do |component, index|
+           if index.zero?
+             token_builder << component[:raw]
+             normalized_builder << component[:normalized]
+             next
+           end
+
+           gap = component[:gap]
+
+           if connector_gap?(gap)
+             token_builder << gap
+             normalized_builder << gap_connector_representation(gap)
+           else
+             token_builder << " "
+             normalized_builder << " "
+           end
+
+           token_builder << component[:raw]
+           normalized_builder << component[:normalized]
+         end
+
+         { token: token_builder, normalized: normalized_builder }
+       end
+
+       def gap_connector_representation(gap)
+         stripped = gap.delete("\s")
+         return gap if stripped.empty?
+
+         stripped + (gap.match?(/\s+$/) ? " " : "")
+       end
+
+       def latin_joiners
+         @latin_joiners ||= Array(config_read("morphology.latin_joiners")).map(&:to_s)
+       end
+
+       def configured_hiragana_suffixes
+         @configured_hiragana_suffixes ||= begin
+           suffixes = config_read("morphology.japanese_hiragana_suffixes")
+           suffixes&.map(&:to_s)
+         end
+       end
+
+       def configured_japanese_category_breaks
+         @configured_japanese_category_breaks ||= begin
+           entries = Array(config_read("morphology.japanese_category_breaks"))
+           entries.each_with_object(Set.new) do |entry, set|
+             next unless entry
+
+             from, to = entry.to_s.split(/_to_/, 2)
+             from_category = CATEGORY_ALIASES[from]
+             to_category = CATEGORY_ALIASES[to]
+
+             set << [from_category, to_category] if from_category && to_category
+           end
+         end
+       end
+
+       def japanese_category_break?(current, following)
+         breaks = configured_japanese_category_breaks
+         return false if breaks.empty?
+
+         breaks.include?([current[:category], following[:category]])
+       end
+
+       def config_read(key)
+         return nil unless @config
+
+         @config.read(key)
+       rescue NoMethodError
+         nil
+       end
+     end
+   end
+ end
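
For context, a minimal usage sketch of the new analyzer (not part of the diff itself): it assumes Nokogiri is available for the `node:` argument and that the gem's files and `Coelacanth.config` are already loaded. The `call(node:, title:, markdown:)` signature and the returned `{ token:, score:, count: }` hashes come from the code above; the sample HTML and strings are illustrative only.

require "nokogiri"
require "coelacanth"

# Parse a small article fragment; headings and accent elements feed the positional boosts.
doc = Nokogiri::HTML(<<~HTML)
  <article>
    <h1>形態素解析の紹介</h1>
    <p>Morphological analysis splits article text into <strong>candidate terms</strong>.</p>
  </article>
HTML

analyzer = Coelacanth::Extractor::MorphologicalAnalyzer.new
keywords = analyzer.call(
  node: doc,
  title: "形態素解析の紹介",
  markdown: "Morphological analysis splits article text into candidate terms."
)

# Each entry carries the surface token, its heuristic score and its body count.
keywords.each { |entry| puts format("%s %.2f (%d)", entry[:token], entry[:score], entry[:count]) }
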
@@ -0,0 +1,166 @@
+ # frozen_string_literal: true
+
+ require "cgi"
+ require "json"
+ require "net/http"
+ require "uri"
+
+ require_relative "../http"
+
+ module Coelacanth
+   class Extractor
+     # Applies pre-processing steps before running the main extraction pipeline.
+     class Preprocessor
+       def initialize(preprocessors: default_preprocessors)
+         @preprocessors = preprocessors
+       end
+
+       def call(html:, url: nil)
+         return html if url.nil?
+
+         @preprocessors.each do |preprocessor|
+           processed = preprocessor.call(html: html, url: url)
+           return processed if processed
+         end
+
+         html
+       end
+
+       private
+
+       def default_preprocessors
+         [Preprocessors::YouTube.new]
+       end
+
+       module Preprocessors
+         # Converts YouTube video pages into structured article-like HTML using the
+         # YouTube Data API. This allows the downstream extractor to consume the
+         # video description and thumbnail as if it were a standard article.
+         class YouTube
+           API_ENDPOINT = "https://www.googleapis.com/youtube/v3/videos"
+           WATCH_HOSTS = %w[youtube.com www.youtube.com youtu.be m.youtube.com music.youtube.com].freeze
+
+           def call(html:, url:)
+             video_id = extract_video_id(url)
+             return unless video_id
+
+             api_key = youtube_api_key
+             return if api_key.nil? || api_key.empty?
+
+             snippet = fetch_snippet(video_id, api_key)
+             return unless snippet
+
+             build_document(snippet)
+           end
+
+           private
+
+           def extract_video_id(url)
+             uri = URI.parse(url)
+             return unless WATCH_HOSTS.include?(uri.host)
+
+             if uri.host == "youtu.be"
+               uri.path.split("/").reject(&:empty?).first
+             else
+               params = URI.decode_www_form(uri.query.to_s).to_h
+               params["v"].to_s.strip
+             end
+           rescue URI::InvalidURIError
+             nil
+           end
+
+           def youtube_api_key
+             Coelacanth.config.read("youtube.api_key").to_s.strip
+           rescue StandardError
+             ""
+           end
+
+           def fetch_snippet(video_id, api_key)
+             uri = URI(API_ENDPOINT)
+             uri.query = URI.encode_www_form(part: "snippet", id: video_id, key: api_key)
+
+             response = Coelacanth::HTTP.raw_get_response(uri)
+             return unless response.is_a?(Net::HTTPSuccess)
+
+             payload = JSON.parse(response.body.to_s)
+             payload.fetch("items", []).first&.fetch("snippet", nil)
+           rescue Coelacanth::TimeoutError, JSON::ParserError, StandardError
+             nil
+           end
+
+           def build_document(snippet)
+             title = snippet["title"].to_s.strip
+             description = snippet["description"].to_s
+             published_at = snippet["publishedAt"].to_s.strip
+             thumbnail_url = preferred_thumbnail(snippet["thumbnails"])
+
+             body_html = render_description(description)
+             thumbnail_markup = render_thumbnail(thumbnail_url, title)
+             article_html = "#{thumbnail_markup}#{body_html}"
+
+             jsonld = {
+               "@context" => "https://schema.org",
+               "@type" => "Article",
+               "headline" => title,
+               "datePublished" => published_at,
+               "articleBody" => article_html
+             }.to_json
+
+             <<~HTML
+               <html data-preprocessor="youtube">
+                 <head>
+                   <title>#{escape_html(title)}</title>
+                   <meta property="article:published_time" content="#{escape_html(published_at)}" />
+                   <meta property="og:image" content="#{escape_html(thumbnail_url)}" />
+                   <script type="application/ld+json">#{jsonld}</script>
+                 </head>
+                 <body>
+                   <article>
+                     <h1>#{escape_html(title)}</h1>
+                     #{article_html}
+                   </article>
+                 </body>
+               </html>
+             HTML
+           end
+
+           def render_description(description)
+             blocks = description.split(/\r?\n{2,}/).map(&:strip).reject(&:empty?)
+             return "<p></p>" if blocks.empty?
+
+             blocks.map do |block|
+               lines = block.split(/\r?\n/).map { |line| escape_html(line) }
+               "<p>#{lines.join('<br />')}</p>"
+             end.join
+           end
+
+           def render_thumbnail(thumbnail_url, title)
+             return "" if thumbnail_url.to_s.strip.empty?
+
+             <<~HTML
+               <figure>
+                 <img src="#{escape_html(thumbnail_url)}" alt="#{escape_html(title)} thumbnail" />
+               </figure>
+             HTML
+           end
+
+           def preferred_thumbnail(thumbnails)
+             return "" unless thumbnails.is_a?(Hash)
+
+             %w[maxres standard high medium default].each do |size|
+               url = thumbnails.dig(size, "url").to_s.strip
+               return url unless url.empty?
+             end
+
+             ""
+           end
+
+           def escape_html(value)
+             CGI.escapeHTML(value.to_s)
+           end
+         end
+       end
+     end
+   end
+ end
+
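
Similarly, a hedged sketch of how the preprocessor might be wired up (again, not part of the diff): the `Coelacanth::Extractor::Preprocessor` class and the "youtube.api_key" config key come from the code above, while the URL and HTML literals are illustrative only. Without an API key the YouTube preprocessor returns nil and the original markup is passed through unchanged.

require "coelacanth"

# Run the default preprocessor chain (currently only the YouTube preprocessor)
# over a watch-page URL before extraction.
preprocessor = Coelacanth::Extractor::Preprocessor.new
html = preprocessor.call(
  html: "<html><body>raw watch-page markup</body></html>",
  url: "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
)

# When the Data API lookup succeeds, `html` is the article-style document built by
# build_document; otherwise the original markup is returned as-is.
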