web-author 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. checksums.yaml +7 -0
  2. data/.ruby-version +1 -0
  3. data/CHANGELOG.md +5 -0
  4. data/LICENSE.txt +21 -0
  5. data/README.md +234 -0
  6. data/Rakefile +12 -0
  7. data/lib/web_author/author/strategies/author_from_ld_schema.rb +54 -0
  8. data/lib/web_author/author/strategies/author_from_meta.rb +16 -0
  9. data/lib/web_author/author/strategy.rb +25 -0
  10. data/lib/web_author/json_ld_schema_processor.rb +46 -0
  11. data/lib/web_author/ld_author.rb +30 -0
  12. data/lib/web_author/ld_schema.rb +74 -0
  13. data/lib/web_author/page.rb +54 -0
  14. data/lib/web_author/version.rb +5 -0
  15. data/lib/web_author.rb +14 -0
  16. data/sig/webauthor.rbs +4 -0
  17. data/sorbet/config +4 -0
  18. data/sorbet/rbi/annotations/.gitattributes +1 -0
  19. data/sorbet/rbi/annotations/minitest.rbi +119 -0
  20. data/sorbet/rbi/annotations/mocha.rbi +34 -0
  21. data/sorbet/rbi/annotations/rainbow.rbi +269 -0
  22. data/sorbet/rbi/annotations/webmock.rbi +9 -0
  23. data/sorbet/rbi/gems/.gitattributes +1 -0
  24. data/sorbet/rbi/gems/addressable.rbi +203 -0
  25. data/sorbet/rbi/gems/addressable@2.8.7.rbi +1994 -0
  26. data/sorbet/rbi/gems/ast.rbi +49 -0
  27. data/sorbet/rbi/gems/ast@2.4.3.rbi +585 -0
  28. data/sorbet/rbi/gems/benchmark@0.4.0.rbi +618 -0
  29. data/sorbet/rbi/gems/bigdecimal.rbi +86 -0
  30. data/sorbet/rbi/gems/bigdecimal@3.1.9.rbi +8 -0
  31. data/sorbet/rbi/gems/crack.rbi +62 -0
  32. data/sorbet/rbi/gems/crack@1.0.0.rbi +145 -0
  33. data/sorbet/rbi/gems/erubi@1.13.1.rbi +155 -0
  34. data/sorbet/rbi/gems/hashdiff.rbi +66 -0
  35. data/sorbet/rbi/gems/hashdiff@1.1.2.rbi +353 -0
  36. data/sorbet/rbi/gems/json@2.10.2.rbi +2112 -0
  37. data/sorbet/rbi/gems/language_server-protocol.rbi +2868 -0
  38. data/sorbet/rbi/gems/language_server-protocol@3.17.0.4.rbi +9 -0
  39. data/sorbet/rbi/gems/lint_roller.rbi +75 -0
  40. data/sorbet/rbi/gems/lint_roller@1.1.0.rbi +240 -0
  41. data/sorbet/rbi/gems/logger@1.7.0.rbi +963 -0
  42. data/sorbet/rbi/gems/minitest.rbi +440 -0
  43. data/sorbet/rbi/gems/minitest@5.25.5.rbi +1547 -0
  44. data/sorbet/rbi/gems/mocha.rbi +653 -0
  45. data/sorbet/rbi/gems/mocha@2.7.1.rbi +12 -0
  46. data/sorbet/rbi/gems/netrc@0.11.0.rbi +159 -0
  47. data/sorbet/rbi/gems/nokogiri-1.18.6-arm64.rbi +1135 -0
  48. data/sorbet/rbi/gems/parallel.rbi +88 -0
  49. data/sorbet/rbi/gems/parallel@1.26.3.rbi +291 -0
  50. data/sorbet/rbi/gems/parser.rbi +1544 -0
  51. data/sorbet/rbi/gems/parser@3.3.7.3.rbi +5532 -0
  52. data/sorbet/rbi/gems/prism.rbi +4090 -0
  53. data/sorbet/rbi/gems/prism@1.4.0.rbi +41732 -0
  54. data/sorbet/rbi/gems/public_suffix.rbi +105 -0
  55. data/sorbet/rbi/gems/public_suffix@6.0.1.rbi +936 -0
  56. data/sorbet/rbi/gems/racc.rbi +15 -0
  57. data/sorbet/rbi/gems/racc@1.8.1.rbi +160 -0
  58. data/sorbet/rbi/gems/rainbow.rbi +122 -0
  59. data/sorbet/rbi/gems/rainbow@3.1.1.rbi +403 -0
  60. data/sorbet/rbi/gems/rake.rbi +650 -0
  61. data/sorbet/rbi/gems/rake@13.2.1.rbi +3033 -0
  62. data/sorbet/rbi/gems/rbi@0.3.1.rbi +6599 -0
  63. data/sorbet/rbi/gems/rbs@3.9.2.rbi +6978 -0
  64. data/sorbet/rbi/gems/rdoc.rbi +555 -0
  65. data/sorbet/rbi/gems/regexp_parser.rbi +1039 -0
  66. data/sorbet/rbi/gems/regexp_parser@2.10.0.rbi +3795 -0
  67. data/sorbet/rbi/gems/rexml.rbi +637 -0
  68. data/sorbet/rbi/gems/rexml@3.4.1.rbi +5346 -0
  69. data/sorbet/rbi/gems/rubocop-ast.rbi +1470 -0
  70. data/sorbet/rbi/gems/rubocop-ast@1.43.0.rbi +7765 -0
  71. data/sorbet/rbi/gems/rubocop-minitest.rbi +450 -0
  72. data/sorbet/rbi/gems/rubocop-minitest@0.37.1.rbi +2609 -0
  73. data/sorbet/rbi/gems/rubocop-performance.rbi +593 -0
  74. data/sorbet/rbi/gems/rubocop-performance@1.24.0.rbi +3359 -0
  75. data/sorbet/rbi/gems/rubocop-rake.rbi +87 -0
  76. data/sorbet/rbi/gems/rubocop-rake@0.7.1.rbi +328 -0
  77. data/sorbet/rbi/gems/rubocop-rubycw.rbi +40 -0
  78. data/sorbet/rbi/gems/rubocop-rubycw@0.2.2.rbi +91 -0
  79. data/sorbet/rbi/gems/rubocop.rbi +10554 -0
  80. data/sorbet/rbi/gems/rubocop@1.75.1.rbi +61875 -0
  81. data/sorbet/rbi/gems/ruby-progressbar.rbi +321 -0
  82. data/sorbet/rbi/gems/ruby-progressbar@1.13.0.rbi +1318 -0
  83. data/sorbet/rbi/gems/ruby2_keywords@0.0.5.rbi +9 -0
  84. data/sorbet/rbi/gems/spoom@1.6.1.rbi +7274 -0
  85. data/sorbet/rbi/gems/tapioca@0.16.11.rbi +3628 -0
  86. data/sorbet/rbi/gems/thor@1.3.2.rbi +4378 -0
  87. data/sorbet/rbi/gems/unicode-display_width.rbi +28 -0
  88. data/sorbet/rbi/gems/unicode-display_width@3.1.4.rbi +132 -0
  89. data/sorbet/rbi/gems/unicode-emoji.rbi +18 -0
  90. data/sorbet/rbi/gems/unicode-emoji@4.0.4.rbi +251 -0
  91. data/sorbet/rbi/gems/web_author.rbi +20 -0
  92. data/sorbet/rbi/gems/webmock.rbi +512 -0
  93. data/sorbet/rbi/gems/webmock@3.25.1.rbi +1792 -0
  94. data/sorbet/rbi/gems/yard-sorbet@0.9.0.rbi +435 -0
  95. data/sorbet/rbi/gems/yard.rbi +36 -0
  96. data/sorbet/rbi/gems/yard@0.9.37.rbi +18379 -0
  97. data/sorbet/rbi/gems/zeitwerk.rbi +240 -0
  98. data/sorbet/rbi/gems/zeitwerk@2.7.2.rbi +1141 -0
  99. data/sorbet/tapioca/config.yml +13 -0
  100. data/sorbet/tapioca/require.rb +4 -0
  101. metadata +188 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 7b0531fe79190a66e3119213938b8cf6236bf104b2d2f94efb7f955c149befcf
4
+ data.tar.gz: 57ea9eafe75368e6d28d8ea631690c44c682d677cf9b3ea49c1780d1c811619b
5
+ SHA512:
6
+ metadata.gz: c15ab0e076f4a1956dbddf3be9f2fe918fccb01966650427fe32e292d65783cffa857e88295d28878f878cd36a215f1c3a1469848d608503fc72270cbf2f8a91
7
+ data.tar.gz: cac9b4fb83ecbf69c176a2d4e365876c9290fcf429141a0923afa49e6140999226a8c06d4058cdde1eaeb6d266b5bd681a896001e232f6448ea4b184e77aab59
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 3.4.1
data/CHANGELOG.md ADDED
@@ -0,0 +1,5 @@
1
+ # [Releases]
2
+
3
+ ## [0.1.0] - 2025-04-14
4
+
5
+ - Initial release with support for meta tags and JSON-LD schema
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2025 Lucian Ghinda
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,234 @@
1
+ # WebAuthor
2
+
3
+ WebAuthor is a Ruby gem that extracts author information from web pages using multiple strategies. It can detect authors from both meta tags and JSON-LD schema, providing a reliable way to identify content creators.
4
+
5
+ ## Features
6
+
7
+ - Extract author information from HTML meta tags
8
+ - Extract author information from JSON-LD schema (schema.org)
9
+ - Support for multiple authors in a single page
10
+ - Fallback strategy - tries different methods until an author is found
11
+ - Clean, type-safe code with Sorbet
12
+
13
+ ## Installation
14
+
15
+ Add this line to your application's Gemfile:
16
+
17
+ ```ruby
18
+ gem 'web-author'
19
+ ```
20
+
21
+ And then execute:
22
+
23
+ ```bash
24
+ $ bundle install
25
+ ```
26
+
27
+ Or install it yourself as:
28
+
29
+ ```bash
30
+ $ gem install web-author
31
+ ```
32
+
33
+ ## Usage
34
+
35
+ ### Basic Usage
36
+
37
+ ```ruby
38
+ require 'web_author'
39
+
40
+ # Create a new Page object with a URL
41
+ page = WebAuthor::Page.new(url: 'https://example.com/article')
42
+
43
+ # Get the author of the page
44
+ author = page.author
45
+ # => "John Doe"
46
+ ```
47
+
48
+ WebAuthor will first try to find author information in JSON-LD schema data, then fall back to meta tags if needed.
49
+
50
+ ### Handling Multiple Authors
51
+
52
+ If a page has multiple authors in the JSON-LD schema, WebAuthor returns them as a comma-separated string:
53
+
54
+ ```ruby
55
+ page = WebAuthor::Page.new(url: 'https://example.com/collaboration-article')
56
+ authors = page.author
57
+ # => "Jane Smith, Bob Johnson"
58
+ ```
59
+
60
+ ### Error Handling
61
+
62
+ WebAuthor raises `WebAuthor::Error` when it cannot fetch the page, for example when the server responds with a non-success HTTP status:
63
+
64
+ ```ruby
65
+ begin
66
+ page = WebAuthor::Page.new(url: 'https://example.com/article')
67
+ author = page.author
68
+ rescue WebAuthor::Error => e
69
+ puts "Failed to get author: #{e.message}"
70
+ end
71
+ ```
72
+
73
+ ## How It Works
74
+
75
+ WebAuthor tries its extraction strategies in a fixed order and returns the first author it finds:
76
+
77
+ 1. First, it tries to find author information in JSON-LD schema (often found in `<script type="application/ld+json">` tags)
78
+ 2. If no author is found in JSON-LD, it looks for a meta tag with the name "author" (`<meta name="author" content="Author Name">`)
79
+ 3. If no author is found using any strategy, it returns `nil`
80
+
81
+ ## Supported Author Formats
82
+
83
+ ### Meta Tags
84
+
85
+ ```html
86
+ <meta name="author" content="Author Name" />
87
+ ```
88
+
89
+ ### JSON-LD Schema
90
+
91
+ Single author:
92
+
93
+ ```html
94
+ <script type="application/ld+json">
95
+ {
96
+ "@context": "https://schema.org",
97
+ "@type": "Article",
98
+ "author": {
99
+ "@type": "Person",
100
+ "name": "Author Name"
101
+ }
102
+ }
103
+ </script>
104
+ ```
105
+
106
+ Multiple authors:
107
+
108
+ ```html
109
+ <script type="application/ld+json">
110
+ {
111
+ "@context": "https://schema.org",
112
+ "@type": "Article",
113
+ "author": [
114
+ {
115
+ "@type": "Person",
116
+ "name": "First Author"
117
+ },
118
+ {
119
+ "@type": "Person",
120
+ "name": "Second Author"
121
+ }
122
+ ]
123
+ }
124
+ </script>
125
+ ```
126
+
127
+ ## Requirements
128
+
129
+ - Ruby 3.4 or higher
130
+ - Nokogiri
131
+ - Sorbet Runtime
132
+ - Zeitwerk
133
+
134
+ ## Development
135
+
136
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
137
+
138
+ To install this gem onto your local machine, run `bundle exec rake install`.
139
+
140
+ ### Development Workflow
141
+
142
+ #### Type Checking with Sorbet
143
+
144
+ This project uses [Sorbet](https://sorbet.org/) for static type checking. To run the type checker:
145
+
146
+ ```bash
147
+ $ bin/type-check
148
+ ```
149
+
150
+ or directly:
151
+
152
+ ```bash
153
+ $ bundle exec srb tc
154
+ ```
155
+
156
+ #### Running Tests
157
+
158
+ Run all tests using:
159
+
160
+ ```bash
161
+ $ bundle exec rake test
162
+ ```
163
+
164
+ Run a specific test file:
165
+
166
+ ```bash
167
+ $ bundle exec ruby -Ilib:test test/web_author/page_test.rb
168
+ ```
169
+
170
+ #### Code Style and Linting
171
+
172
+ This project follows Ruby style guidelines enforced by RuboCop. Run the linter with:
173
+
174
+ ```bash
175
+ $ bundle exec rubocop
176
+ ```
177
+
178
+ Auto-fix issues when possible:
179
+
180
+ ```bash
181
+ $ bundle exec rubocop -A
182
+ ```
183
+
184
+ #### Running All Checks
185
+
186
+ The default Rake task runs both tests and RuboCop:
187
+
188
+ ```bash
189
+ $ bundle exec rake
190
+ ```
191
+
192
+ ### Working with Sorbet
193
+
194
+ WebAuthor uses Sorbet for static type checking. When adding new code:
195
+
196
+ 1. Add the comment `# typed: strict` at the top of the file
197
+ 2. Add type signatures to methods using `sig` blocks
198
+ 3. Run `bin/type-check` to verify type safety
199
+
200
+ Example of typed code:
201
+
202
+ ```ruby
203
+ extend T::Sig
204
+
205
+ sig { params(url: String).void }
206
+ def initialize(url:)
207
+ @url = T.let(url, String)
208
+ @content = T.let(nil, T.nilable(String))
209
+ end
210
+
211
+ sig { returns(T.nilable(String)) }
212
+ def author
213
+ # method implementation
214
+ end
215
+ ```
216
+
217
+ ### Adding a new strategy
218
+
219
+ Create a new class that inherits from `WebAuthor::Author::Strategy` and implements the `author` method.
220
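+ Every strategy receives the parsed page through its initializer and can read it via `document`, which is a `Nokogiri::XML::Document` object. For `WebAuthor::Page` to use the new strategy, add its class to the `STRATEGIES` list in `lib/web_author/page.rb`.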
221
+
222
+ ## Contributing
223
+
224
+ 1. Fork it
225
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
226
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
227
+ 4. Push to the branch (`git push origin my-new-feature`)
228
+ 5. Create a new Pull Request
229
+
230
+ Bug reports and pull requests are welcome on GitHub at https://github.com/lucianghinda/web_author.
231
+
232
+ ## License
233
+
234
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'minitest/test_task'
5
+
6
+ Minitest::TestTask.create
7
+
8
+ require 'rubocop/rake_task'
9
+
10
+ RuboCop::RakeTask.new
11
+
12
+ task default: %i(test rubocop)
@@ -0,0 +1,54 @@
1
+ # typed: strict
2
+ # frozen_string_literal: true
3
+
4
module WebAuthor
  module Author
    module Strategies
      # Strategy that reads author names from the JSON-LD schemas found in the
      # document.
      class AuthorFromLdSchema < Strategy
        extend T::Sig

        sig { override.params(document: Nokogiri::XML::Document).void }
        def initialize(document)
          # Memo slot for #schemas; filled on first use.
          @_schemas = T.let(nil, T.nilable(T::Array[LdSchema]))
          super
        end

        # Returns the distinct author names joined with ", ", or nil when the
        # document has no JSON-LD schemas or none of them names an author.
        sig { override.returns(T.nilable(String)) }
        def author
          return nil if schemas.empty?

          distinct_names = author_names.uniq
          distinct_names.empty? ? nil : distinct_names.join(', ')
        end

        private

        # Schemas extracted from the document, computed once per instance.
        sig { returns(T::Array[LdSchema]) }
        def schemas
          @_schemas ||= JsonLdSchemaProcessor.new(document:).schemas
        end

        # Collects the non-nil author names of every schema, in schema order.
        # A schema may carry no author, a single author, or a list of authors.
        sig { returns(T::Array[String]) }
        def author_names
          schemas.flat_map do |schema|
            parsed = schema.parsed_author

            case parsed
            when nil then []
            when Array then parsed.filter_map(&:name)
            else [parsed.name].compact
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
+ # typed: strict
2
+ # frozen_string_literal: true
3
+
4
module WebAuthor
  module Author
    module Strategies
      # Strategy that reads the author from the page's
      # `<meta name="author" content="...">` tag.
      class AuthorFromMeta < Strategy
        # Returns the whitespace-trimmed `content` of the first
        # `meta[name="author"]` element.
        #
        # Returns nil when the tag is missing, has no `content` attribute, or
        # the content is blank. A blank value is reported as nil because an
        # empty string is truthy and would otherwise be treated by callers as
        # a found author.
        sig { override.returns(T.nilable(String)) }
        def author
          meta_author = document.at_css('meta[name="author"]')
          content = meta_author&.attribute('content')&.value&.strip
          return nil if content.nil? || content.empty?

          content
        end
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
+ # typed: strict
2
+ # frozen_string_literal: true
3
+
4
module WebAuthor
  module Author
    # Abstract base class for author-extraction strategies. A strategy is
    # built around a parsed document and implements #author.
    class Strategy
      extend T::Sig
      extend T::Helpers

      abstract!

      # Stores the document the strategy will inspect.
      sig { overridable.params(document: Nokogiri::XML::Document).void }
      def initialize(document)
        @document = T.let(document, Nokogiri::XML::Document)
      end

      # Returns the author found by this strategy, or nil when none is found.
      # Must be implemented by subclasses.
      sig { abstract.returns(T.nilable(String)) }
      def author; end

      private

      # The document handed to the initializer; available to subclasses only.
      sig { overridable.returns(Nokogiri::XML::Document) }
      attr_reader :document
    end
  end
end
@@ -0,0 +1,46 @@
1
+ # typed: strict
2
+ # frozen_string_literal: true
3
+
4
+ require 'json'
5
+
6
module WebAuthor
  # Finds the `<script type="application/ld+json">` tags of a document and
  # turns their JSON payloads into LdSchema objects.
  class JsonLdSchemaProcessor
    extend T::Sig

    JSON_LD_SCHEMA_XPATH = '//script[@type="application/ld+json"]'

    # JSON-LD keyword under which a node object may nest further node objects.
    GRAPH_KEY = '@graph'
    private_constant :GRAPH_KEY

    sig { params(document: Nokogiri::XML::Document).void }
    def initialize(document:)
      @document = T.let(document, Nokogiri::XML::Document)
      @_schemas = T.let(nil, T.nilable(T::Array[LdSchema]))
    end

    # All schemas found in the document, extracted once and then memoized.
    sig { returns(T::Array[LdSchema]) }
    def schemas
      @_schemas ||= extract_schemas
    end

    private

    sig { returns(Nokogiri::XML::Document) }
    attr_reader :document

    # Builds the schemas of every JSON-LD script tag, in document order.
    #
    # A script tag contributes zero or more schemas: blank tags and tags with
    # invalid JSON contribute none, while a top-level array or an `@graph`
    # contributes one schema per node object.
    sig { returns(T::Array[LdSchema]) }
    def extract_schemas
      json_ld_script_tags.flat_map do |script_tag|
        content = script_tag.content.strip
        next [] if content.empty?

        schema_hashes(JSON.parse(content)).filter_map { |hash| build_schema(hash) }
      rescue JSON::ParserError
        # Skip invalid JSON as we don't need to process it but we want to
        # let the processing move on to the next script tag that is JSON-LD
        []
      end
    end

    # Flattens parsed JSON into the list of node objects (hashes) it contains.
    #
    # A hash yields itself followed by whatever is nested under `@graph`; an
    # array yields the node objects of each entry; any other value (string,
    # number, boolean, nil) yields nothing.
    sig { params(json_data: T.untyped).returns(T::Array[T::Hash[String, T.untyped]]) }
    def schema_hashes(json_data)
      case json_data
      when Hash
        [json_data, *schema_hashes(json_data[GRAPH_KEY])]
      when Array
        json_data.flat_map { |entry| schema_hashes(entry) }
      else
        []
      end
    end

    # Builds one LdSchema, or returns nil when the node cannot be represented.
    sig { params(hash: T::Hash[String, T.untyped]).returns(T.nilable(LdSchema)) }
    def build_schema(hash)
      LdSchema.from_hash(hash)
    rescue TypeError
      # NOTE(review): sorbet-runtime is expected to raise TypeError when a
      # value does not match a declared prop type — confirm against the
      # configured handlers. Such a node is skipped so that one unexpected
      # shape does not discard the remaining schemas of the page.
      nil
    end

    sig { returns(Nokogiri::XML::NodeSet) }
    def json_ld_script_tags = document.xpath(JSON_LD_SCHEMA_XPATH)
  end
end
@@ -0,0 +1,30 @@
1
+ # typed: true
2
+ # frozen_string_literal: true
3
+
4
module WebAuthor
  # Value object for one author entry of a JSON-LD schema.
  class LdAuthor < T::Struct
    extend T::Sig

    const :type, T.nilable(String), default: nil
    const :name, T.nilable(String), default: nil
    const :url, T.nilable(String), default: nil
    const :additional_properties, T::Hash[String, T.untyped], default: {}

    # JSON keys that map onto the dedicated struct fields; every other key is
    # kept in `additional_properties`.
    ATTRIBUTES = T.let(['@type', 'name', 'url'].freeze, T::Array[String])

    # Builds an author from a JSON-LD author value.
    #
    # A plain string is taken as the author's name. A hash supplies `@type`
    # (stored as `type`), `name` and `url`; its remaining entries are stored
    # under `additional_properties` with string keys.
    sig { params(hash: T.any(String, T::Hash[String, T.untyped])).returns(T.attached_class) }
    def self.from_hash(hash)
      return new(name: hash) if hash.is_a?(String)

      new(
        type: hash['@type'],
        name: hash['name'],
        url: hash['url'],
        additional_properties: hash.except(*ATTRIBUTES).transform_keys(&:to_s)
      )
    end
  end
end
@@ -0,0 +1,74 @@
1
+ # typed: strict
2
+ # frozen_string_literal: true
3
+
4
+ require 'json'
5
+
6
module WebAuthor
  # Value object for one JSON-LD node, exposing the properties this gem reads
  # as struct fields and keeping the rest in `additional_properties`.
  class LdSchema < T::Struct
    extend T::Sig

    AuthorBaseType = T.type_alias { T.any(String, T::Hash[String, T.untyped]) }
    ArrayofAuthors = T.type_alias { T::Array[AuthorBaseType] }
    AuthorType = T.type_alias { T.any(AuthorBaseType, ArrayofAuthors) }

    const :context, T.nilable(T.any(String, T::Hash[String, T.untyped])), default: nil
    const :type, T.nilable(T.any(String, T::Array[String])), default: nil
    const :id, T.nilable(String), default: nil
    const :name, T.untyped, default: nil
    const :description, T.untyped, default: nil
    const :headline, T.untyped, default: nil
    const :url, T.untyped, default: nil
    const :image, T.untyped, default: nil
    const :author, T.nilable(AuthorType), default: nil
    const :publisher, T.nilable(T.any(String, T::Hash[String, T.untyped])), default: nil
    const :blog_post, T.nilable(T::Array[T::Hash[String, T.untyped]]), default: nil
    const :additional_properties, T::Hash[String, T.untyped], default: {}

    # JSON keys that map onto dedicated struct fields.
    ATTRIBUTES = T.let([
      '@context', '@type', '@id', 'name', 'description', 'url', 'image',
      'author', 'headline', 'publisher', 'blogPost'
    ].freeze, T::Array[String])

    # Parses a JSON string and builds a schema from the result.
    # JSON parsing errors are not rescued here.
    sig { params(script_html_tag: String).returns(T.attached_class) }
    def self.from_script_tag(script_html_tag)
      from_hash(JSON.parse(script_html_tag))
    end

    # Builds a schema from a parsed JSON-LD hash.
    #
    # Keys listed in ATTRIBUTES become struct fields: a leading "@" is
    # dropped and `blogPost` is stored as `blog_post`. All other entries are
    # stored under `additional_properties` with string keys.
    sig { params(hash: T::Hash[String, T.untyped]).returns(T.attached_class) }
    def self.from_hash(hash)
      known = hash.slice(*ATTRIBUTES)
      known['blog_post'] = known.delete('blogPost')
      fields = known.transform_keys { |key| key.delete_prefix('@').to_sym }

      extras = hash.except(*ATTRIBUTES).transform_keys(&:to_s)

      new(**fields, additional_properties: extras)
    end

    # The `author` value converted to LdAuthor objects: nil when there is no
    # author, one LdAuthor for a string or hash, an array of LdAuthor for an
    # array. The result, including nil, is computed once and reused.
    sig { returns(T.nilable(T.any(LdAuthor, T::Array[LdAuthor]))) }
    def parsed_author
      # `defined?` rather than `||=` so that a nil result is also remembered.
      return @_parsed_author if defined?(@_parsed_author)

      @_parsed_author = T.let(parse_author, T.nilable(T.any(LdAuthor, T::Array[LdAuthor])))
    end

    private

    # Converts the raw `author` value into LdAuthor objects.
    sig { returns(T.nilable(T.any(LdAuthor, T::Array[LdAuthor]))) }
    def parse_author
      case author
      when nil
        nil
      when Array
        T.cast(author, ArrayofAuthors).map { |entry| LdAuthor.from_hash(entry) }
      when String, Hash
        LdAuthor.from_hash(T.cast(author, AuthorBaseType))
      end
    end
  end
end
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+ # typed: strict
3
+
4
module WebAuthor
  # A web page identified by its URL. The page is fetched over HTTP the first
  # time #author is called and the parsed document is kept in #page_content.
  class Page
    extend T::Sig

    # The order of these strategies is important as it determines in case of conflicting data
    # which strategy should be used first.
    STRATEGIES = T.let([
      WebAuthor::Author::Strategies::AuthorFromLdSchema,
      WebAuthor::Author::Strategies::AuthorFromMeta
    ].freeze, T::Array[T.class_of(WebAuthor::Author::Strategy)])
    private_constant :STRATEGIES

    sig { returns(String) }
    attr_reader :url

    # The parsed page, or nil while the page has not been fetched yet.
    sig { returns(T.nilable(Nokogiri::XML::Document)) }
    attr_reader :page_content

    sig { params(url: String).void }
    def initialize(url:)
      @url = T.let(url, String)
      @page_content = T.let(nil, T.nilable(Nokogiri::XML::Document))
    end

    # Returns the author reported by the first strategy that finds one, or
    # nil when no strategy finds an author.
    #
    # Raises WebAuthor::Error when the page cannot be fetched.
    sig { returns(T.nilable(String)) }
    def author
      fetch_page_content unless page_content
      document = T.must(page_content)

      STRATEGIES.each do |strategy_class|
        found = strategy_class.new(document).author
        return found if found
      end

      nil
    end

    private

    # Downloads and parses the page, storing the document in @page_content.
    #
    # Raises WebAuthor::Error when the URL is malformed or not HTTP(S), when
    # the request fails at the network, timeout, protocol or TLS level, or
    # when the response is not a success. The underlying exception, if any,
    # remains reachable through the raised error's `cause`.
    sig { returns(T.nilable(Nokogiri::XML::Document)) }
    def fetch_page_content
      uri = URI.parse(url)
      # URI::HTTPS is a subclass of URI::HTTP, so this accepts both schemes.
      raise Error, "Failed to fetch page: unsupported URL #{url}" unless uri.is_a?(URI::HTTP)

      response = Net::HTTP.get_response(uri)
      unless response.is_a?(Net::HTTPSuccess)
        raise Error, "Failed to fetch page: #{response.code} #{response.message}"
      end

      @page_content = Nokogiri::HTML(response.body)
    rescue URI::InvalidURIError, SocketError, SystemCallError, IOError, Timeout::Error,
           Net::ProtocolError, Net::HTTPBadResponse, OpenSSL::SSL::SSLError => e
      raise Error, "Failed to fetch page: #{e.message}"
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module WebAuthor
4
+ VERSION = '0.1.0'
5
+ end
data/lib/web_author.rb ADDED
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'zeitwerk'
4
+ require 'nokogiri'
5
+ require 'sorbet-runtime'
6
+ require 'net/http'
7
+ require 'uri'
8
+
9
+ loader = Zeitwerk::Loader.for_gem
10
+ loader.setup
11
+
12
+ module WebAuthor
13
+ class Error < StandardError; end
14
+ end
data/sig/webauthor.rbs ADDED
@@ -0,0 +1,4 @@
1
# Type signatures for the gem's top-level module, which the Ruby sources
# define as `WebAuthor`.
module WebAuthor
  VERSION: String
  # See the writing guide of rbs: https://github.com/ruby/rbs#guides
end
data/sorbet/config ADDED
@@ -0,0 +1,4 @@
1
+ --dir
2
+ .
3
+ --ignore=/tmp/
4
+ --ignore=/vendor/bundle
@@ -0,0 +1 @@
1
+ **/*.rbi linguist-vendored=true