coelacanth 0.4.3 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative "extractor/preprocessor"
3
4
  require_relative "extractor/normalizer"
4
5
  require_relative "extractor/metadata_probe"
5
6
  require_relative "extractor/heuristic_probe"
@@ -8,6 +9,9 @@ require_relative "extractor/fallback_probe"
8
9
  require_relative "extractor/markdown_renderer"
9
10
  require_relative "extractor/image_collector"
10
11
  require_relative "extractor/markdown_listing_collector"
12
+ require_relative "extractor/eyecatch_image_extractor"
13
+ require_relative "extractor/morphological_analyzer"
14
+ require_relative "extractor/utilities"
11
15
 
12
16
  module Coelacanth
13
17
  # High-level API for extracting articles without site-specific selectors.
@@ -22,8 +26,9 @@ module Coelacanth
22
26
  keyword_init: true
23
27
  )
24
28
 
25
- def call(html:, url: nil)
26
- document = Normalizer.new.call(html: html, base_url: url)
29
+ def call(html:, url: nil, response_metadata: nil)
30
+ preprocessed_html = Preprocessor.new.call(html: html, url: url)
31
+ document = Normalizer.new.call(html: preprocessed_html, base_url: url)
27
32
 
28
33
  [
29
34
  [MetadataProbe.new, 0.85],
@@ -34,34 +39,64 @@ module Coelacanth
34
39
  result = probe.call(doc: document, url: url)
35
40
  next unless result
36
41
 
37
- return build_response(result, document:, url:) if result.confidence.to_f >= threshold
42
+ return build_response(result, document:, url:, response_metadata: response_metadata) if result.confidence.to_f >= threshold
38
43
  end
39
44
 
40
45
  build_response(
41
46
  PipelineResult.new(node: document, source_tag: :none, confidence: 0.0),
42
47
  document: document,
43
- url: url
48
+ url: url,
49
+ response_metadata: response_metadata
44
50
  )
45
51
  end
46
52
 
47
53
  private
48
54
 
49
- def build_response(result, document:, url:)
55
+ def build_response(result, document:, url:, response_metadata:)
50
56
  node = result.node
51
57
  body_markdown = MarkdownRenderer.render(node)
52
58
  body_markdown_list = body_markdown.to_s.split(/\n{2,}/).map { |segment| segment.strip }.reject(&:empty?)
59
+ body_morphemes = MorphologicalAnalyzer.new(config: Coelacanth.config).call(
60
+ node: node,
61
+ title: result.title,
62
+ markdown: body_markdown
63
+ )
64
+
65
+ site_name = extract_site_name(document)
66
+ body_text = extract_body_text(node)
53
67
 
54
68
  {
55
69
  title: result.title,
56
70
  body_markdown: body_markdown,
57
71
  body_markdown_list: body_markdown_list,
72
+ body_morphemes: body_morphemes,
58
73
  images: ImageCollector.new.call(node),
74
+ eyecatch_image: EyecatchImageExtractor.new.call(doc: document, base_url: url),
59
75
  published_at: result.published_at,
60
76
  byline: result.byline,
61
77
  source: result.source_tag,
62
78
  confidence: result.confidence,
63
- listings: MarkdownListingCollector.new.call(markdown: body_markdown, base_url: url)
79
+ listings: MarkdownListingCollector.new.call(markdown: body_markdown, base_url: url),
80
+ site_name: site_name,
81
+ body_text: body_text,
82
+ response_metadata: response_metadata || {}
64
83
  }
65
84
  end
85
+
86
+ def extract_site_name(document)
87
+ Utilities.meta_content(
88
+ document,
89
+ "meta[property='og:site_name']",
90
+ "meta[name='application-name']",
91
+ "meta[name='apple-mobile-web-app-title']",
92
+ "meta[name='twitter:site']"
93
+ ) || document.at_css("title")&.text&.strip
94
+ end
95
+
96
+ def extract_body_text(node)
97
+ return if node.nil?
98
+
99
+ node.text.to_s.gsub(/\s+/, " ").strip
100
+ end
66
101
  end
67
102
  end
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "delegate"
3
4
  require "net/http"
4
5
  require "open-uri"
5
6
  require "timeout"
@@ -14,7 +15,30 @@ module Coelacanth
14
15
  DEFAULT_READ_TIMEOUT = 10
15
16
  MAX_RETRIES = 2
16
17
 
17
- ErrorResponse = Struct.new(:status, :meta, :base_uri, :body, keyword_init: true) do
18
+ Response = Class.new(SimpleDelegator) do
19
+ attr_reader :status_code, :headers, :final_uri
20
+
21
+ def initialize(response, final_uri: nil)
22
+ super(response)
23
+ @status_code = response.respond_to?(:code) ? response.code.to_i : nil
24
+ @headers = response.respond_to?(:each_header) ? response.each_header.to_h : {}
25
+ @final_uri = (response.respond_to?(:uri) ? response.uri : nil) || final_uri
26
+ end
27
+
28
+ def final_url
29
+ final_uri&.to_s
30
+ end
31
+
32
+ def is_a?(klass)
33
+ super || __getobj__.is_a?(klass)
34
+ end
35
+
36
+ def kind_of?(klass)
37
+ is_a?(klass)
38
+ end
39
+ end
40
+
41
+ ErrorResponse = Struct.new(:status, :meta, :base_uri, :final_uri, :body, keyword_init: true) do
18
42
  def string
19
43
  body.to_s
20
44
  end
@@ -26,7 +50,8 @@ module Coelacanth
26
50
 
27
51
  def get_response(uri, open_timeout: DEFAULT_OPEN_TIMEOUT, read_timeout: DEFAULT_READ_TIMEOUT, retries: MAX_RETRIES)
28
52
  ensure_allowed!(uri)
29
- raw_get_response(uri, open_timeout: open_timeout, read_timeout: read_timeout, retries: retries)
53
+ response = raw_get_response(uri, open_timeout: open_timeout, read_timeout: read_timeout, retries: retries)
54
+ Response.new(response, final_uri: uri)
30
55
  end
31
56
 
32
57
  def raw_get_response(uri, open_timeout: DEFAULT_OPEN_TIMEOUT, read_timeout: DEFAULT_READ_TIMEOUT, retries: MAX_RETRIES)
@@ -63,6 +88,7 @@ module Coelacanth
63
88
  status: [response.code, response.message],
64
89
  meta: response.each_header.to_h,
65
90
  base_uri: uri,
91
+ final_uri: response.respond_to?(:uri) ? response.uri : uri,
66
92
  body: response.body
67
93
  )
68
94
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Coelacanth
4
- VERSION = "0.4.3"
4
+ VERSION = "0.5.0"
5
5
  end
data/lib/coelacanth.rb CHANGED
@@ -29,15 +29,21 @@ module Coelacanth
29
29
  rescue Coelacanth::TimeoutError
30
30
  nil
31
31
  end
32
+ response_metadata = {
33
+ status_code: response&.status_code,
34
+ headers: response&.headers || {},
35
+ final_url: response&.final_url || regular_url
36
+ }
32
37
  html = response&.body.to_s
33
38
  html = html.dup
34
39
  html = html.force_encoding(Encoding::UTF_8)
35
40
  html = html.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
36
- extractor_result = Extractor.new.call(html: html, url: regular_url)
41
+ extractor_result = Extractor.new.call(html: html, url: regular_url, response_metadata: response_metadata)
37
42
  {
38
43
  dom: Dom.new.oga(regular_url, html: html),
39
44
  screenshot: @client.get_screenshot,
40
45
  extraction: extractor_result,
46
+ response: response_metadata
41
47
  }
42
48
  end
43
49
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: coelacanth
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.3
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yusuke
@@ -37,13 +37,16 @@ files:
37
37
  - lib/coelacanth/configure.rb
38
38
  - lib/coelacanth/dom.rb
39
39
  - lib/coelacanth/extractor.rb
40
+ - lib/coelacanth/extractor/eyecatch_image_extractor.rb
40
41
  - lib/coelacanth/extractor/fallback_probe.rb
41
42
  - lib/coelacanth/extractor/heuristic_probe.rb
42
43
  - lib/coelacanth/extractor/image_collector.rb
43
44
  - lib/coelacanth/extractor/markdown_listing_collector.rb
44
45
  - lib/coelacanth/extractor/markdown_renderer.rb
45
46
  - lib/coelacanth/extractor/metadata_probe.rb
47
+ - lib/coelacanth/extractor/morphological_analyzer.rb
46
48
  - lib/coelacanth/extractor/normalizer.rb
49
+ - lib/coelacanth/extractor/preprocessor.rb
47
50
  - lib/coelacanth/extractor/utilities.rb
48
51
  - lib/coelacanth/extractor/weak_ml_probe.rb
49
52
  - lib/coelacanth/http.rb