coelacanth 0.4.3 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,556 @@
+ # frozen_string_literal: true
+
+ require "set"
+ require "strscan"
+
+ module Coelacanth
+   class Extractor
+     # Scores candidate morphemes extracted from article content. The
+     # implementation follows a light-weight heuristic approach that approximates
+     # the specification shared in the user instructions. It prioritises noun-ish
+     # phrases for both Japanese and English text, applies positional boosts, and
+     # returns the highest scoring terms.
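+     #
+     # Illustrative usage (the values below are made up): #call returns up to
+     # TOP_K hashes such as { token: "machine learning", score: 2.4, count: 3 },
+     # ordered by descending score; #call_text scores plain text without a DOM node.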
+     class MorphologicalAnalyzer
+       TOKEN_PATTERN = /
+         \p{Han}+ | # Kanji sequences
+         \p{Hiragana}+ | # Hiragana sequences
+         [\p{Katakana}ー]+ | # Katakana sequences including the choonpu
+         [A-Za-z0-9]+(?:-[A-Za-z0-9]+)* # Latin alphanumerics keeping inner hyphen
+       /x.freeze
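+       # Illustrative tokenisation: "自然言語処理とRuby" yields the tokens
+       # "自然言語処理" (kanji), "と" (hiragana) and "Ruby" (latin).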
+
+       MARKDOWN_CONTROL_PATTERN = /[`*_>#\[\]\(\)\{\}!\+=|~]/.freeze
+
+       EN_STOPWORDS = %w[
+         a an and are as at be but by for if in into is it its of on or such
+         that the their then there these they this to was were will with
+       ].freeze
+
+       EN_GENERIC_TERMS = %w[
+         page pages article articles category categories tag tags image images
+         video videos click home link links read more author authors post posts
+       ].freeze
+
+       JA_GENERIC_TERMS = %w[カテゴリ カテゴリー 記事 画像 写真 まとめ サイト 投稿 ブログ 最新 人気 関連].freeze
+
+       FULLWIDTH_ALPHA = "ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ".freeze
+       HALF_WIDTH_ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz".freeze
+       FULLWIDTH_DIGITS = "０１２３４５６７８９".freeze
+       HALF_WIDTH_DIGITS = "0123456789".freeze
+       FULLWIDTH_HYPHENS = "-―ーー".freeze
+
+       TOP_K = 8
+       MAX_POSITION_BOOST = 3.0
+       LENGTH_BONUS_FACTOR = 0.15
+       MAX_LENGTH_BONUS = 1.6
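+       # A term's score (computed in #call) is ln(body_count + 1) multiplied by
+       # a positional boost capped at MAX_POSITION_BOOST and by a length bonus of
+       # 1 + LENGTH_BONUS_FACTOR * (components - 1), capped at MAX_LENGTH_BONUS.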
+
+       POSITION_WEIGHTS = {
+         body: 1.0,
+         title: 2.2,
+         h1: 1.6,
+         h2: 1.3,
+         accent: 1.1
+       }.freeze
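+       # Weights above 1.0 contribute (weight - 1.0) to a term's positional
+       # bonus when the term appears in that context (title, h1, h2 or accent).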
+
+       CATEGORY_ALIASES = {
+         "kanji" => :kanji,
+         "hiragana" => :hiragana,
+         "katakana" => :katakana,
+         "latin" => :latin
+       }.freeze
+
+       Term = Struct.new(:key, :token, :components, :language, keyword_init: true)
+
+       def initialize(config: Coelacanth.config)
+         @config = config
+       end
+
+       def call_text(text, title: nil)
+         call(node: nil, title: title, markdown: text)
+       end
+
+       def call(node:, title:, markdown:)
+         stats = Hash.new do |hash, key|
+           hash[key] = {
+             token: nil,
+             components: 1,
+             body_count: 0,
+             pos_bonus: 0.0,
+             language: nil
+           }
+         end
+
+         body_terms = extract_terms(markdown)
+
+         contexts = [
+           [POSITION_WEIGHTS[:title], extract_terms(title)],
+           [POSITION_WEIGHTS[:h1], extract_terms(text_for(node, "h1"))],
+           [POSITION_WEIGHTS[:h2], extract_terms(text_for(node, "h2"))],
+           [
+             POSITION_WEIGHTS[:accent],
+             extract_terms(
+               [
+                 text_for(node, "a"),
+                 text_for(node, "strong"),
+                 text_for(node, "b"),
+                 text_for(node, "img", attribute: "alt")
+               ].compact.join(" ")
+             )
+           ],
+           [POSITION_WEIGHTS[:body], body_terms]
+         ]
+
+         contexts.each do |weight, terms|
+           next if terms.empty?
+
+           grouped = terms.group_by(&:key)
+           grouped.each_value do |occurrences|
+             representative = occurrences.max_by(&:components)
+             entry = stats[representative.key]
+             entry[:token] ||= representative.token
+             entry[:components] = [entry[:components], representative.components].max
+             entry[:language] ||= representative.language
+
+             bonus = weight - 1.0
+             entry[:pos_bonus] += bonus if bonus.positive?
+           end
+         end
+
+         body_terms.each do |term|
+           entry = stats[term.key]
+           entry[:token] ||= term.token
+           entry[:components] = [entry[:components], term.components].max
+           entry[:language] ||= term.language
+           entry[:body_count] += 1
+         end
+
+         scored = stats.values.map do |entry|
+           next if entry[:body_count].zero?
+
+           tf = Math.log(entry[:body_count] + 1.0)
+           pos_boost = [1.0 + entry[:pos_bonus], MAX_POSITION_BOOST].min
+           len_bonus = [1.0 + LENGTH_BONUS_FACTOR * (entry[:components] - 1), MAX_LENGTH_BONUS].min
+           score = tf * pos_boost * len_bonus
+
+           entry.merge(score: score)
+         end.compact
+
+         return [] if scored.empty?
+
+         sorted = scored.sort_by { |entry| [-entry[:score], entry[:token]] }
+         pruned = prune_inclusions(sorted)
+         max_score = pruned.first[:score]
+         threshold = max_score * 0.35
+
+         selected = pruned.select { |entry| entry[:score] >= threshold }
+
+         if selected.length < TOP_K
+           pruned.each do |entry|
+             next if selected.include?(entry)
+
+             selected << entry
+             break if selected.length >= TOP_K
+           end
+         end
+
+         selected = selected.take(TOP_K)
+
+         selected.map do |entry|
+           {
+             token: entry[:token],
+             score: entry[:score],
+             count: entry[:body_count]
+           }
+         end
+       end
+
+       private
+
+       def extract_terms(text)
+         sanitized = sanitize_text(text)
+         return [] if sanitized.empty?
+
+         tokens = tokenize(sanitized)
+         build_terms(tokens)
+       end
+
+       def sanitize_text(text)
+         sanitized = text.to_s
+         return "" if sanitized.empty?
+
+         sanitized = sanitized.gsub(MARKDOWN_CONTROL_PATTERN, " ")
+         sanitized.gsub(/^[ \t]*[-+*]\s+/, " ")
+       end
+
+       def tokenize(text)
+         scanner = StringScanner.new(text)
+         tokens = []
+         gap_buffer = String.new
+         until scanner.eos?
+           if (whitespace = scanner.scan(/\s+/))
+             gap_buffer << whitespace
+             next
+           elsif (raw = scanner.scan(TOKEN_PATTERN))
+             end_pos = scanner.pos
+             start_pos = end_pos - raw.length
+             category = detect_category(raw)
+             normalized = normalize_token(raw, category)
+             tokens << {
+               raw: raw,
+               normalized: normalized,
+               category: category,
+               start: start_pos,
+               end: end_pos,
+               gap: gap_buffer
+             }
+             gap_buffer = String.new
+           else
+             gap_buffer << scanner.getch
+           end
+         end
+
+         tokens
+       end
+
+       def build_terms(tokens)
+         terms = []
+         index = 0
+
+         while index < tokens.length
+           token = tokens[index]
+
+           case token[:category]
+           when :latin
+             sequences, index = consume_latin_sequences(tokens, index)
+             sequences.each do |sequence|
+               joined = join_latin_sequence(sequence)
+               normalized = joined[:normalized]
+               components = sequence.length
+
+               next unless valid_english_term?(normalized)
+
+               terms << Term.new(key: normalized, token: normalized, components: components, language: :en)
+             end
+           when :kanji, :katakana
+             sequence, index = consume_japanese_sequence(tokens, index)
+             next if sequence.empty?
+
+             normalized = sequence.map { |component| component[:normalized] }.join
+             components = sequence.length
+
+             next unless valid_japanese_term?(normalized)
+
+             terms << Term.new(key: normalized, token: normalized, components: components, language: :ja)
+           else
+             index += 1
+           end
+         end
+
+         terms
+       end
+
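+       # Collects a run of adjacent Latin tokens and splits it on stopwords.
+       # Illustrative: "machine learning for beginners" yields the candidate
+       # phrases "machine learning" and "beginner" (after lemmatisation).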
+       def consume_latin_sequences(tokens, index)
+         run = []
+         pointer = index
+
+         while pointer < tokens.length
+           token = tokens[pointer]
+           break unless token[:category] == :latin
+
+           run << token
+           pointer += 1
+
+           break unless pointer < tokens.length
+
+           next_token = tokens[pointer]
+           break unless next_token[:category] == :latin && joinable_gap?(next_token[:gap], :latin)
+         end
+
+         sequences = split_latin_run(run)
+         [sequences, pointer]
+       end
+
+       def split_latin_run(run)
+         sequences = []
+         current = []
+
+         run.each do |token|
+           if english_stopword?(token[:normalized])
+             if current.any?
+               sequences << current
+               current = []
+             end
+           else
+             current << token
+           end
+         end
+
+         sequences << current if current.any?
+         sequences
+       end
+
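+       # Joins adjacent kanji/katakana tokens into a single candidate term, e.g.
+       # "機械学習モデル" stays one two-component term (assuming no kanji-to-katakana
+       # break is configured under morphology.japanese_category_breaks).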
+       def consume_japanese_sequence(tokens, index)
+         sequence = []
+         pointer = index
+
+         while pointer < tokens.length
+           token = tokens[pointer]
+           unless japanese_noun_token?(token) || (sequence.any? && hiragana_suffix?(sequence.last, token))
+             break
+           end
+
+           sequence << token
+           pointer += 1
+
+           break unless pointer < tokens.length
+
+           next_token = tokens[pointer]
+           break unless japanese_sequence_continues?(token, next_token)
+         end
+
+         [sequence, pointer]
+       end
+
+       def japanese_sequence_continues?(current, following)
+         return false if japanese_category_break?(current, following)
+
+         return false unless japanese_noun_token?(following) ||
+                             (following[:category] == :latin && joinable_gap?(following[:gap], :latin)) ||
+                             hiragana_suffix?(current, following)
+
+         gap = following[:gap]
+         return true if gap.empty?
+
+         gap.strip.empty?
+       end
+
+       def japanese_noun_token?(token)
+         [:kanji, :katakana].include?(token[:category])
+       end
+
+       def hiragana_suffix?(current, following)
+         return false unless current[:category] == :kanji
+         return false unless following[:category] == :hiragana
+         return false unless following[:gap].empty?
+
+         suffixes = configured_hiragana_suffixes
+         return true if suffixes.nil?
+
+         suffixes.include?(following[:raw])
+       end
+
+       def whitespace_gap?(gap)
+         gap.strip.empty?
+       end
+
+       def joinable_gap?(gap, category)
+         return true if whitespace_gap?(gap)
+
+         case category
+         when :latin
+           connector_gap?(gap)
+         else
+           false
+         end
+       end
+
+       def connector_gap?(gap)
+         return false if gap.nil?
+
+         stripped = gap.delete("\s")
+         return false if stripped.empty?
+
+         stripped.chars.all? { |char| latin_joiners.include?(char) }
+       end
+
+       def normalize_token(token, category)
+         normalized = token
+                      .tr(FULLWIDTH_ALPHA, HALF_WIDTH_ALPHA)
+                      .tr(FULLWIDTH_DIGITS, HALF_WIDTH_DIGITS)
+                      .tr(FULLWIDTH_HYPHENS, "-")
+                      .downcase
+
+         normalized = lemmatize_latin(normalized) if category == :latin
+         normalized
+       end
+
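+       # Examples: "東京" -> :kanji, "です" -> :hiragana, "テスト" -> :katakana,
+       # anything else (e.g. "ruby2024") -> :latin.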
+       def detect_category(token)
+         return :kanji if token.match?(/\A\p{Han}+\z/)
+         return :hiragana if token.match?(/\A\p{Hiragana}+\z/)
+         return :katakana if token.match?(/\A[\p{Katakana}ー]+\z/)
+
+         :latin
+       end
+
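+       # Very small plural stemmer rather than a full lemmatiser. Examples of
+       # its output: "categories" -> "category", "boxes" -> "box", "links" -> "link";
+       # short, hyphenated and "ss"/"us"/"is"-ending words pass through unchanged.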
+       def lemmatize_latin(token)
+         return token if token.length <= 3
+         return token if token.include?("-")
+         return token if token.match?(/\d/)
+         return token if token.end_with?("ss") || token.end_with?("us") || token.end_with?("is")
+
+         if token.end_with?("ies") && token.length > 3
+           token[0...-3] + "y"
+         elsif token.end_with?("es") && !token.end_with?("ses") && token.length > 3
+           token[0...-2]
+         elsif token.end_with?("s")
+           token[0...-1]
+         else
+           token
+         end
+       end
+
+       def valid_english_term?(normalized)
+         return false if normalized.empty?
+         return false if normalized.match?(/\A\d+\z/)
+
+         words = normalized.split(/\s+/)
+         return false if words.length == 1 && words.first.length <= 2
+
+         return false if EN_GENERIC_TERMS.include?(normalized)
+
+         true
+       end
+
+       def english_stopword?(word)
+         EN_STOPWORDS.include?(word)
+       end
+
+       def valid_japanese_term?(normalized)
+         return false if normalized.empty?
+         return false if normalized.length == 1
+         return false if normalized.match?(/\A[ぁ-ゖゝゞー]+\z/)
+         return false if normalized.match?(/\A\d+\z/)
+         return false if JA_GENERIC_TERMS.include?(normalized)
+
+         true
+       end
+
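+       # Drops lower-scoring terms contained in an already accepted term, e.g.
+       # "learning" is pruned when "machine learning" scored higher; Japanese
+       # terms are compared by substring containment.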
+       def prune_inclusions(entries)
+         accepted = []
+
+         entries.each do |entry|
+           next if accepted.any? { |chosen| contains_term?(chosen, entry) }
+
+           accepted << entry
+         end
+
+         accepted
+       end
+
+       def contains_term?(long_entry, short_entry)
+         return false if long_entry.equal?(short_entry)
+         return false if long_entry[:language] != short_entry[:language]
+         return false if long_entry[:token] == short_entry[:token]
+
+         case long_entry[:language]
+         when :en
+           long_words = long_entry[:token].split(/\s+/)
+           short_words = short_entry[:token].split(/\s+/)
+           return false if short_words.length > long_words.length
+
+           long_words.each_cons(short_words.length) do |slice|
+             return true if slice == short_words
+           end
+
+           false
+         when :ja
+           long_entry[:token].include?(short_entry[:token])
+         else
+           false
+         end
+       end
+
+       def text_for(node, selector, attribute: nil)
+         return "" unless node
+
+         elements = node.css(selector)
+         return "" if elements.empty?
+
+         texts = elements.map do |element|
+           if attribute
+             element[attribute].to_s
+           else
+             element.text
+           end
+         end
+
+         texts.join(" ")
+       end
+
+       def join_latin_sequence(sequence)
+         token_builder = String.new
+         normalized_builder = String.new
+
+         sequence.each_with_index do |component, index|
+           if index.zero?
+             token_builder << component[:raw]
+             normalized_builder << component[:normalized]
+             next
+           end
+
+           gap = component[:gap]
+
+           if connector_gap?(gap)
+             token_builder << gap
+             normalized_builder << gap_connector_representation(gap)
+           else
+             token_builder << " "
+             normalized_builder << " "
+           end
+
+           token_builder << component[:raw]
+           normalized_builder << component[:normalized]
+         end
+
+         { token: token_builder, normalized: normalized_builder }
+       end
+
+       def gap_connector_representation(gap)
+         stripped = gap.delete("\s")
+         return gap if stripped.empty?
+
+         stripped + (gap.match?(/\s+$/) ? " " : "")
+       end
+
+       def latin_joiners
+         @latin_joiners ||= Array(config_read("morphology.latin_joiners")).map(&:to_s)
+       end
+
+       def configured_hiragana_suffixes
+         @configured_hiragana_suffixes ||= begin
+           suffixes = config_read("morphology.japanese_hiragana_suffixes")
+           suffixes&.map(&:to_s)
+         end
+       end
+
+       def configured_japanese_category_breaks
+         @configured_japanese_category_breaks ||= begin
+           entries = Array(config_read("morphology.japanese_category_breaks"))
+           entries.each_with_object(Set.new) do |entry, set|
+             next unless entry
+
+             from, to = entry.to_s.split(/_to_/, 2)
+             from_category = CATEGORY_ALIASES[from]
+             to_category = CATEGORY_ALIASES[to]
+
+             set << [from_category, to_category] if from_category && to_category
+           end
+         end
+       end
+
+       def japanese_category_break?(current, following)
+         breaks = configured_japanese_category_breaks
+         return false if breaks.empty?
+
+         breaks.include?([current[:category], following[:category]])
+       end
+
+       def config_read(key)
+         return nil unless @config
+
+         @config.read(key)
+       rescue NoMethodError
+         nil
+       end
+     end
+   end
+ end
@@ -0,0 +1,166 @@
+ # frozen_string_literal: true
+
+ require "cgi"
+ require "json"
+ require "net/http"
+ require "uri"
+
+ require_relative "../http"
+
+ module Coelacanth
+   class Extractor
+     # Applies pre-processing steps before running the main extraction pipeline.
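+     # Each preprocessor may return replacement HTML for the given URL; the
+     # first truthy result wins, otherwise the original html is passed through.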
+     class Preprocessor
+       def initialize(preprocessors: default_preprocessors)
+         @preprocessors = preprocessors
+       end
+
+       def call(html:, url: nil)
+         return html if url.nil?
+
+         @preprocessors.each do |preprocessor|
+           processed = preprocessor.call(html: html, url: url)
+           return processed if processed
+         end
+
+         html
+       end
+
+       private
+
+       def default_preprocessors
+         [Preprocessors::YouTube.new]
+       end
+
+       module Preprocessors
+         # Converts YouTube video pages into structured article-like HTML using the
+         # YouTube Data API. This allows the downstream extractor to consume the
+         # video description and thumbnail as if it were a standard article.
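+         # Illustrative flow for a hypothetical https://www.youtube.com/watch?v=abc123:
+         # GET https://www.googleapis.com/youtube/v3/videos?part=snippet&id=abc123&key=...
+         # and the returned snippet is rendered as an <article> containing the
+         # title, published date, description paragraphs and preferred thumbnail.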
+         class YouTube
+           API_ENDPOINT = "https://www.googleapis.com/youtube/v3/videos"
+           WATCH_HOSTS = %w[youtube.com www.youtube.com youtu.be m.youtube.com music.youtube.com].freeze
+
+           def call(html:, url:)
+             video_id = extract_video_id(url)
+             return unless video_id
+
+             api_key = youtube_api_key
+             return if api_key.nil? || api_key.empty?
+
+             snippet = fetch_snippet(video_id, api_key)
+             return unless snippet
+
+             build_document(snippet)
+           end
+
+           private
+
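+           # Accepts watch-style URLs (youtube.com/watch?v=<id>, including the
+           # m. and music. hosts) as well as short youtu.be/<id> links.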
+           def extract_video_id(url)
+             uri = URI.parse(url)
+             return unless WATCH_HOSTS.include?(uri.host)
+
+             if uri.host == "youtu.be"
+               uri.path.split("/").reject(&:empty?).first
+             else
+               params = URI.decode_www_form(uri.query.to_s).to_h
+               params["v"].to_s.strip
+             end
+           rescue URI::InvalidURIError
+             nil
+           end
+
+           def youtube_api_key
+             Coelacanth.config.read("youtube.api_key").to_s.strip
+           rescue StandardError
+             ""
+           end
+
+           def fetch_snippet(video_id, api_key)
+             uri = URI(API_ENDPOINT)
+             uri.query = URI.encode_www_form(part: "snippet", id: video_id, key: api_key)
+
+             response = Coelacanth::HTTP.raw_get_response(uri)
+             return unless response.is_a?(Net::HTTPSuccess)
+
+             payload = JSON.parse(response.body.to_s)
+             payload.fetch("items", []).first&.fetch("snippet", nil)
+           rescue Coelacanth::TimeoutError, JSON::ParserError, StandardError
+             nil
+           end
+
+           def build_document(snippet)
+             title = snippet["title"].to_s.strip
+             description = snippet["description"].to_s
+             published_at = snippet["publishedAt"].to_s.strip
+             thumbnail_url = preferred_thumbnail(snippet["thumbnails"])
+
+             body_html = render_description(description)
+             thumbnail_markup = render_thumbnail(thumbnail_url, title)
+             article_html = "#{thumbnail_markup}#{body_html}"
+
+             jsonld = {
+               "@context" => "https://schema.org",
+               "@type" => "Article",
+               "headline" => title,
+               "datePublished" => published_at,
+               "articleBody" => article_html
+             }.to_json
+
+             <<~HTML
+               <html data-preprocessor="youtube">
+               <head>
+               <title>#{escape_html(title)}</title>
+               <meta property="article:published_time" content="#{escape_html(published_at)}" />
+               <meta property="og:image" content="#{escape_html(thumbnail_url)}" />
+               <script type="application/ld+json">#{jsonld}</script>
+               </head>
+               <body>
+               <article>
+               <h1>#{escape_html(title)}</h1>
+               #{article_html}
+               </article>
+               </body>
+               </html>
+             HTML
+           end
+
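+           # Blank-line separated blocks of the description become <p> elements
+           # and single newlines inside a block become <br /> tags, e.g.
+           # "Intro\n\nLine one\nLine two" renders as
+           # "<p>Intro</p><p>Line one<br />Line two</p>".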
+           def render_description(description)
+             blocks = description.split(/(?:\r?\n){2,}/).map(&:strip).reject(&:empty?)
+             return "<p></p>" if blocks.empty?
+
+             blocks.map do |block|
+               lines = block.split(/\r?\n/).map { |line| escape_html(line) }
+               "<p>#{lines.join('<br />')}</p>"
+             end.join
+           end
+
+           def render_thumbnail(thumbnail_url, title)
+             return "" if thumbnail_url.to_s.strip.empty?
+
+             <<~HTML
+               <figure>
+               <img src="#{escape_html(thumbnail_url)}" alt="#{escape_html(title)} thumbnail" />
+               </figure>
+             HTML
+           end
+
+           def preferred_thumbnail(thumbnails)
+             return "" unless thumbnails.is_a?(Hash)
+
+             %w[maxres standard high medium default].each do |size|
+               url = thumbnails.dig(size, "url").to_s.strip
+               return url unless url.empty?
+             end
+
+             ""
+           end
+
+           def escape_html(value)
+             CGI.escapeHTML(value.to_s)
+           end
+         end
+       end
+     end
+   end
+ end
+