bsky-parser 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7fdc55d4d3f8e9870d3269b17fd625dae9531097fd23cd28baf2818369aa2e56
4
- data.tar.gz: 2ac103361580b7e635be487f4711175c7a4c48d2882d6bf53ebca63617b44112
3
+ metadata.gz: 9644188145dd0c8339bac95a664eb8571b4e0491ecb4a6caecca335e8cbeeee7
4
+ data.tar.gz: d79bf56d7329841971cc3e2d1d5149216603d3bdd863212a048e96835c49de2f
5
5
  SHA512:
6
- metadata.gz: 93856e7fbd30936e2174248771bd7311ac0199578a083ae19b5a389973a4e2edaaf4166fceb0419141362803c4fc2ea616a5d5e8b42e1235c4b17674f6d2fab3
7
- data.tar.gz: 967011def1d229208d152f14006f394341bed0797c21629dba8f0d5442e75b98b1e454f9f4a093c7f41348e120a5a598fb529af69f8ca25768f38bc4079341ee
6
+ metadata.gz: 378e2e528768da19d162f68ffdbf0f48e4fd3d1f4a47fab457e6a643707314f9b4422b4210d169004c3d59661a839bfd6a8b68d44e2eaa4e963c779e103a40e3
7
+ data.tar.gz: f33448b0ac7a7cb653e020ddda3c3a67ff3d1fa0a48133bd4449567398298b3b8d4f3783b3eb32816abbf28a36aee30c4da040839e3430a63eda7f95c5b9239b
data/.rubocop.yml CHANGED
@@ -1,5 +1,3 @@
1
- inherit_from: .rubocop_todo.yml
2
-
3
1
  AllCops:
4
2
  TargetRubyVersion: 3.3
5
3
  NewCops: enable
@@ -19,9 +17,13 @@ Style/FrozenStringLiteralComment:
19
17
 
20
18
  Metrics/MethodLength:
21
19
  Max: 20
20
+ Exclude:
21
+ - "test/**/*"
22
22
 
23
23
  Metrics/AbcSize:
24
24
  Max: 30
25
+ Exclude:
26
+ - 'test/**/*'
25
27
 
26
28
  Metrics/ClassLength:
27
29
  Enabled: true
data/CHANGELOG.md ADDED
@@ -0,0 +1,23 @@
1
+ ## [Unreleased]
2
+
3
+ ## [1.0.2] - 2025-02-25
4
+
5
+ ### Fixed
6
+
7
+ - Fix tag facet regex. Previously, it would match this entire string: `#hello!`. Now it correctly matches `#hello` without the `!`.
8
+ - Regex patterns for mentions, tags, and URL facets correctly match when they appear at the beginning of text without requiring a leading space. For example, `"#hello"` is now properly detected as a tag, while mid-word occurrences like `"hello#hello"` are still ignored.
9
+ - URL facet has been fixed to not match if it occurs mid-word. E.g. `hellohttps://example.com` no longer matches.
10
+ - Correctly compute facet indices so that they account for the leading space when there are multiple matches.
11
+
12
+ ## [1.0.1] - 2025-02-19
13
+
14
+ ### Added
15
+ - More files from the `bundle gem` generator. Originally, I followed the RubyGems guide, which listed fewer files.
16
+ - Added LICENSE
17
+
18
+ ### Fixed
19
+ - Fixed Rubocop linting rule errors.
20
+
21
+ ## [1.0.0] - 2025-02-19
22
+
23
+ - Initial Release.
data/README.md CHANGED
@@ -1,8 +1,5 @@
1
1
  # Bsky Parser
2
2
 
3
- > [!WARNING]
4
- > API is stable but development is still in progress. Aka no tests yet!
5
-
6
3
  Gem that will parse text content and generate Bluesky rich text facets.
7
4
 
8
5
  Facets supported:
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
module BskyParser
  module Facets
    # Common parent for every facet parser.
    #
    # A facet object wraps the text it was built with and exposes it through
    # +content+. Subclasses are expected to supply their own +process+; the
    # version defined here only raises.
    class BaseFacet
      attr_reader :content

      class << self
        # Builds a facet for +content+ and immediately runs it.
        #
        # @param content the text handed to the new instance
        # @return whatever the instance-level +process+ returns
        def process(content)
          facet = new(content)
          facet.process
        end
      end

      # @param content the text this facet will work on
      def initialize(content)
        @content = content
      end

      # Placeholder that concrete facets override.
      #
      # @raise [NotImplementedError] always, naming the class that lacks an
      #   implementation
      def process
        message = "#{self.class} has not implemented method '#{__method__}'"
        raise NotImplementedError, message
      end
    end
  end
end
@@ -0,0 +1,79 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base_facet"
4
+
5
module BskyParser
  module Facets
    # Converts Markdown-style links such as `[text](https://example.com)` into
    # link facets.
    #
    # Unlike a facet that only reports positions, processing here also
    # rewrites the text: every Markdown link is replaced by its link text, and
    # the facet indices refer to that rewritten text.
    #
    # The class-level +process+ entry point is inherited from BaseFacet.
    class MarkdownLinkFacet < BaseFacet
      # Strips Markdown link syntax from the content and builds one link facet
      # per link found.
      #
      # The output is assembled left to right in a single pass, so each facet
      # position is measured against the text emitted so far. Positions are
      # byte offsets (String#bytesize), as the byteStart/byteEnd keys require;
      # character offsets would drift as soon as the text contains multibyte
      # characters.
      #
      # @return [Array(String, Array<Hash>)] the rewritten text, followed by
      #   the link facets in order of appearance
      def process
        facets = []
        # Start from an empty string in the content's own encoding so that
        # appending slices of the content never mixes encodings.
        result_text = String.new(encoding: content.encoding)
        cursor = 0

        find_markdown_links.each do |link|
          match_start, match_end = link[:offsets]

          # Copy the untouched text that sits before this link.
          result_text << content[cursor...match_start]

          byte_start = result_text.bytesize
          # Replace markdown syntax with just the text
          result_text << link[:text]

          facets << build_facet(byte_start, result_text.bytesize, link[:link])
          cursor = match_end
        end

        # Copy whatever follows the last link (the whole text if none matched).
        result_text << content[cursor..]

        [result_text, facets]
      end

      private

      def url_pattern
        # This url pattern is different to the URL facet url pattern because
        # we don't want to mix named and numbered capture groups.
        # Instead we convert the numbered to non-capturing groups `?:`
        %r{
          \[
          (?<text>[^\]]+)                       # The link text inside square brackets
          \]
          \(
          (?<url>
            https?://                           # http:// or https://
            (?:www\.)?                          # Optional www.
            [-a-zA-Z0-9@:%._\+~#=]{1,256}       # Domain name
            \.
            [a-zA-Z0-9()]{1,6}                  # TLD
            \b
            (?:[-a-zA-Z0-9()@:%_\+.~#?&/=]*     # URL path, params, etc.
            [-a-zA-Z0-9@%_\+~#/=])?
          )
          \)
        }x
      end

      # Scans the content for Markdown links.
      #
      # @return [Array<Hash>] one hash per link with the link text (:text),
      #   the target URL (:link), the full matched source (:match) and the
      #   [start, end] character offsets of the match in the content (:offsets)
      def find_markdown_links
        content.to_enum(:scan, url_pattern).map do
          match = Regexp.last_match
          {
            text: match[:text],
            link: match[:url],
            match: match.to_s,
            offsets: match.offset(0)
          }
        end
      end

      # Builds the facet hash for a single link.
      #
      # @param byte_start [Integer] byte offset where the link text begins
      # @param byte_end [Integer] byte offset just past the link text
      # @param uri [String] the link target
      # @return [Hash]
      def build_facet(byte_start, byte_end, uri)
        {
          index: {
            byteStart: byte_start,
            byteEnd: byte_end
          },
          features: [{
            "$type": "app.bsky.richtext.facet#link",
            uri: uri
          }]
        }
      end
    end
  end
end
@@ -0,0 +1,68 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base_facet"
4
+
5
module BskyParser
  module Facets
    # Detects `@handle` mentions in the content, resolves each handle to a DID
    # through the resolveHandle endpoint at BASE_URL, and builds mention
    # facets for the handles that resolve.
    class MentionFacet < BaseFacet
      BASE_URL = "https://bsky.social"

      # Finds every mention and turns the resolvable ones into facets.
      #
      # Positions are byte offsets (String#bytesize), as the
      # byteStart/byteEnd keys require; the range covers the leading "@" but
      # not the whitespace that may precede it. Mentions whose handle cannot
      # be resolved are skipped.
      #
      # @return [Array<Hash>] mention facets in order of appearance
      def process
        content.to_enum(:scan, mention_pattern).filter_map do
          match = Regexp.last_match
          boundary = match[1] # "" at the start of a line, otherwise one whitespace character
          mention = match[2]  # the handle including its leading "@"

          did = fetch_did(mention.delete_prefix("@"))
          next if did.nil?

          byte_start = match.pre_match.bytesize + boundary.bytesize
          build_facet([byte_start, byte_start + mention.bytesize], did)
        end
      end

      private

      # Memoized Faraday connection to BASE_URL.
      def conn
        @conn ||= Faraday.new(url: BASE_URL) do |f|
          f.request :json
        end
      end

      # Looks up the DID for a handle.
      #
      # @param handle [String] the handle without its leading "@"
      # @return [String, nil] the "did" value of the response, or nil when the
      #   request fails, is unsuccessful, or the body is not valid JSON
      def fetch_did(handle)
        resp = conn.get("/xrpc/com.atproto.identity.resolveHandle", { handle: handle })
        JSON.parse(resp.body)["did"] if resp.success?
      rescue Faraday::Error, JSON::ParserError
        # A malformed body is treated like a failed request: the mention is
        # skipped instead of aborting the whole parse.
        # TODO: Introduce logging
        nil
      end

      # regex based on: https://atproto.com/specs/handle#handle-identifier-syntax
      # Group 1 is the boundary (line start or one whitespace character),
      # group 2 is the mention itself including the "@".
      def mention_pattern
        /(^|\s)(@([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)/
      end

      # Builds the facet hash for a single mention.
      #
      # @param indices [Array(Integer, Integer)] start and end byte offsets
      # @param handle_did [String] the DID the handle resolved to
      # @return [Hash]
      def build_facet(indices, handle_did)
        {
          index: {
            byteStart: indices[0],
            byteEnd: indices[1]
          },
          features: [{
            "$type": "app.bsky.richtext.facet#mention",
            did: handle_did
          }]
        }
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base_facet"
4
+
5
module BskyParser
  module Facets
    # Detects hashtags (`#tag`) in the content and builds tag facets.
    class TagFacet < BaseFacet
      # Finds every hashtag that starts a line or follows whitespace.
      #
      # Positions are byte offsets (String#bytesize), as the
      # byteStart/byteEnd keys require; the range covers the "#" but not the
      # whitespace that may precede it. The facet's tag value excludes the "#".
      #
      # @return [Array<Hash>] tag facets in order of appearance
      def process
        content.to_enum(:scan, tag_pattern).map do
          match = Regexp.last_match
          boundary = match[1] # "" at the start of a line, otherwise one whitespace character
          hashtag = match[2]  # the tag including its leading "#"

          byte_start = match.pre_match.bytesize + boundary.bytesize
          build_facet([byte_start, byte_start + hashtag.bytesize], hashtag.delete_prefix("#"))
        end
      end

      private

      # Group 1 is the boundary (line start or one whitespace character),
      # group 2 is the hashtag itself including the "#".
      def tag_pattern
        /(^|\s)(#[\w-]+)/
      end

      # Builds the facet hash for a single hashtag.
      #
      # @param indices [Array(Integer, Integer)] start and end byte offsets
      # @param tag [String] the tag without its leading "#"
      # @return [Hash]
      def build_facet(indices, tag)
        {
          index: {
            byteStart: indices[0],
            byteEnd: indices[1]
          },
          features: [{
            "$type": "app.bsky.richtext.facet#tag",
            tag: tag
          }]
        }
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "base_facet"
4
+
5
module BskyParser
  module Facets
    # Detects bare http(s) URLs in the content and builds link facets.
    class URLFacet < BaseFacet
      # Finds every URL that starts a line or follows whitespace.
      #
      # Positions are byte offsets (String#bytesize), as the
      # byteStart/byteEnd keys require; the range covers the URL only, not
      # the whitespace that may precede it.
      #
      # @return [Array<Hash>] link facets in order of appearance
      def process
        content.to_enum(:scan, url_pattern).map do
          match = Regexp.last_match
          boundary = match[1] # "" at the start of a line, otherwise one whitespace character
          url = match[2]

          byte_start = match.pre_match.bytesize + boundary.bytesize
          build_facet([byte_start, byte_start + url.bytesize], url)
        end
      end

      private

      def url_pattern
        # URI::RFC2396_PARSER.make_regexp has a complex regex with multiple capture groups
        # Instead, use the URL pattern from https://docs.bsky.app/docs/advanced-guides/post-richtext
        # Group 1 is the boundary (line start or one whitespace character),
        # group 2 is the URL itself.
        %r{
          (^|\s)
          (https?://
          (www\.)?
          [-a-zA-Z0-9@:%._\+~#=]{1,256}
          \.
          [a-zA-Z0-9()]{1,6}\b
          ([-a-zA-Z0-9()@:%_\+.~#?&/=]*
          [-a-zA-Z0-9@%_\+~#/=])?)
        }x
      end

      # Builds the facet hash for a single URL.
      #
      # @param indices [Array(Integer, Integer)] start and end byte offsets
      # @param url [String] the matched URL
      # @return [Hash]
      def build_facet(indices, url)
        {
          index: {
            byteStart: indices[0],
            byteEnd: indices[1]
          },
          features: [{
            "$type": "app.bsky.richtext.facet#link",
            uri: url
          }]
        }
      end
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module BskyParser
4
- VERSION = "1.0.1"
4
+ VERSION = "1.0.2"
5
5
  end
data/lib/bsky_parser.rb CHANGED
@@ -1,145 +1,25 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative "bsky_parser/version"
4
+ require_relative "bsky_parser/facets/markdown_link_facet"
5
+ require_relative "bsky_parser/facets/tag_facet"
6
+ require_relative "bsky_parser/facets/url_facet"
7
+ require_relative "bsky_parser/facets/mention_facet"
4
8
 
5
9
  require "faraday"
6
10
 
7
11
  module BskyParser
8
12
  class << self
9
- BASE_URL = "https://bsky.social"
10
-
11
13
  def parse(content)
12
- parsed_content, mkdown_facets = process_markdown_links(content)
14
+ parsed_content, mkdown_facets = Facets::MarkdownLinkFacet.process(content)
13
15
 
14
- facets = mkdown_facets + tag_facets(parsed_content) + mention_facets(parsed_content) + url_facets(parsed_content)
16
+ facets =
17
+ mkdown_facets +
18
+ Facets::TagFacet.process(parsed_content) +
19
+ Facets::MentionFacet.process(parsed_content) +
20
+ Facets::URLFacet.process(parsed_content)
15
21
 
16
22
  [parsed_content, facets]
17
23
  end
18
-
19
- private
20
-
21
- def conn
22
- @conn ||= Faraday.new(url: BASE_URL) do |f|
23
- f.request :json
24
- end
25
- end
26
-
27
- def tag_facets(content)
28
- facets = []
29
- tag_pattern = /#\S+/
30
- matches = content.to_enum(:scan, tag_pattern).map do
31
- { tag: Regexp.last_match, indices: Regexp.last_match.offset(0) }
32
- end
33
- matches.each do |match|
34
- tag = match[:tag].to_s[1..] # Trim leading hashtag
35
- indices = match[:indices]
36
- facets << {
37
- index: {
38
- byteStart: indices[0],
39
- byteEnd: indices[1]
40
- },
41
- features: [{
42
- "$type": "app.bsky.richtext.facet#tag",
43
- tag: tag
44
- }]
45
- }
46
- end
47
- facets
48
- end
49
-
50
- def mention_facets(content)
51
- facets = []
52
- # regex based on: https://atproto.com/specs/handle#handle-identifier-syntax
53
- mention_pattern = /[$|\W](@([a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)/
54
- matches = content.to_enum(:scan, mention_pattern).map do
55
- { handle: Regexp.last_match, indices: Regexp.last_match.offset(0) }
56
- end
57
- matches.each do |match|
58
- handle = match[:handle].to_s.strip[1..] # Trim leading @
59
- indices = match[:indices]
60
- resp = conn.get("/xrpc/com.atproto.identity.resolveHandle", { handle: handle })
61
- handle_did = JSON.parse(resp.body)["did"]
62
- facets << {
63
- index: {
64
- byteStart: indices[0],
65
- byteEnd: indices[1]
66
- },
67
- features: [{
68
- "$type": "app.bsky.richtext.facet#mention",
69
- did: handle_did
70
- }]
71
- }
72
- end
73
- facets
74
- end
75
-
76
- def mkdown_links(content)
77
- url_pattern = %r{\[(?<text>[^\]]+)\]\((?<url>https?://(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&/=]*[-a-zA-Z0-9@%_\+~#/=])?)\)}
78
- matches = content.to_enum(:scan, url_pattern).map { Regexp.last_match }
79
-
80
- mkdown_links = []
81
- matches.each do |match|
82
- mkdown_links << {
83
- text: match[:text],
84
- link: match[:url],
85
- match: match.to_s
86
- }
87
- end
88
- mkdown_links
89
- end
90
-
91
- def process_markdown_links(content)
92
- facets = []
93
- result_text = content.dup
94
-
95
- links = mkdown_links(content)
96
-
97
- links.reverse_each do |link|
98
- start_pos = result_text.index(link[:match])
99
-
100
- next unless start_pos
101
-
102
- end_pos = start_pos + link[:match].length
103
-
104
- facets << {
105
- index: {
106
- byteStart: start_pos,
107
- byteEnd: start_pos + link[:text].length
108
- },
109
- features: [{
110
- "$type": "app.bsky.richtext.facet#link",
111
- uri: link[:link]
112
- }]
113
- }
114
-
115
- result_text[start_pos...end_pos] = link[:text]
116
- end
117
-
118
- [result_text, facets]
119
- end
120
-
121
- def url_facets(content)
122
- facets = []
123
- # URL pattern from https://docs.bsky.app/docs/advanced-guides/post-richtext
124
- url_pattern = %r{([$|\W])(https?://(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&/=]*[-a-zA-Z0-9@%_\+~#/=])?)}
125
- matches = content.to_enum(:scan, url_pattern).map do
126
- { url: Regexp.last_match, indices: Regexp.last_match.offset(0) }
127
- end
128
- matches.each do |match|
129
- url = match[:url].to_s[1..]
130
- indices = match[:indices]
131
- facets << {
132
- index: {
133
- byteStart: indices[0],
134
- byteEnd: indices[1]
135
- },
136
- features: [{
137
- "$type": "app.bsky.richtext.facet#link",
138
- uri: url
139
- }]
140
- }
141
- end
142
- facets
143
- end
144
24
  end
145
25
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bsky-parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Yeong
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-02-19 00:00:00.000000000 Z
11
+ date: 2025-02-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: faraday
@@ -32,12 +32,17 @@ extra_rdoc_files: []
32
32
  files:
33
33
  - ".devcontainer/devcontainer.json"
34
34
  - ".rubocop.yml"
35
- - ".rubocop_todo.yml"
35
+ - CHANGELOG.md
36
36
  - CODE_OF_CONDUCT.md
37
37
  - LICENSE
38
38
  - README.md
39
39
  - Rakefile
40
40
  - lib/bsky_parser.rb
41
+ - lib/bsky_parser/facets/base_facet.rb
42
+ - lib/bsky_parser/facets/markdown_link_facet.rb
43
+ - lib/bsky_parser/facets/mention_facet.rb
44
+ - lib/bsky_parser/facets/tag_facet.rb
45
+ - lib/bsky_parser/facets/url_facet.rb
41
46
  - lib/bsky_parser/version.rb
42
47
  - sig/bsky_parser.rbs
43
48
  homepage: https://github.com/jonathanyeong/bsky-parser
data/.rubocop_todo.yml DELETED
@@ -1,31 +0,0 @@
1
- # This configuration was generated by
2
- # `rubocop --auto-gen-config`
3
- # on 2025-02-19 04:07:09 UTC using RuboCop version 1.72.2.
4
- # The point is for the user to remove these configuration records
5
- # one by one as the offenses are removed from the code base.
6
- # Note that changes in the inspected code, or installation of new
7
- # versions of RuboCop, may require this file to be generated again.
8
-
9
- # Offense count: 1
10
- Lint/MixedRegexpCaptureTypes:
11
- Exclude:
12
- - 'lib/bsky_parser.rb'
13
-
14
- # Offense count: 1
15
- # Configuration parameters: CountComments, Max, CountAsOne, AllowedMethods, AllowedPatterns.
16
- Metrics/MethodLength:
17
- Exclude:
18
- - 'lib/bsky_parser.rb'
19
-
20
- # Offense count: 1
21
- # This cop supports unsafe autocorrection (--autocorrect-all).
22
- Style/MapIntoArray:
23
- Exclude:
24
- - 'lib/bsky_parser.rb'
25
-
26
- # Offense count: 3
27
- # This cop supports safe autocorrection (--autocorrect).
28
- # Configuration parameters: AllowHeredoc, AllowURI, URISchemes, IgnoreCopDirectives, AllowedPatterns, SplitStrings.
29
- # URISchemes: http, https
30
- Layout/LineLength:
31
- Max: 175