reddit_post_to_markdown 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/reddit_post_to_markdown/errors.rb +13 -0
- data/lib/reddit_post_to_markdown/post_renderer.rb +334 -0
- data/lib/reddit_post_to_markdown/reddit_client.rb +42 -0
- data/lib/reddit_post_to_markdown/url_validator.rb +47 -0
- data/lib/reddit_post_to_markdown/version.rb +3 -0
- data/lib/reddit_post_to_markdown.rb +74 -0
- data/reddit_post_to_markdown.gemspec +19 -0
- metadata +87 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: fbcefef7bcf13e1fc6d417be5e2a5adf397f69931fbacae27855c0fc0d34e66e
|
|
4
|
+
data.tar.gz: 4cecd3cb2af8878a4766a12b7b33fc6eb7a47bcb957681f4ce7f83248c76a384
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 3e5e54ea2bf6f039f889ee2d19020b7e9d7621f993722b0d1850969d48f67adcfe3ae1bcd26c15548e0512a6d40a3c56a342b9e206e368327925e9c60380ee61
|
|
7
|
+
data.tar.gz: fbfa20da94b37b834cfbcf88b6ae4f625aba540a0abe9ec0738e2fde074f0d9230545c9889ab93ddd7e555de2ac88457e6689f0c0f12189ab369b07353b69220
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
module RedditPostToMarkdown
  # Raised when the given URL does not match a Reddit post URL pattern.
  # Covers subreddit listings, user profiles, search results, and any URL
  # that is not a direct link to a single post.
  class NotAPostError < StandardError; end

  # Raised when the HTTP request to Reddit fails with a non-2xx status code.
  class FetchError < StandardError; end

  # Raised when Reddit's response lacks the expected two-element JSON array
  # structure of a post listing.
  class InvalidResponseError < StandardError; end
end
|
|
@@ -0,0 +1,334 @@
|
|
|
1
|
+
require "time"

module RedditPostToMarkdown
  # Converts Reddit post data and its comments into a Markdown string.
  #
  # The output format matches the {https://github.com/chauduyphanvu/reddit-markdown
  # reddit-markdown} tool: post header, title, selftext, reply count, and a
  # depth-indented comment tree.
  class PostRenderer
    # Replacement text used when a comment matches a filter and no custom
    # +:message+ is provided in the filters hash.
    DEFAULT_FILTERED_MESSAGE = "REMOVED DUE TO CUSTOM FILTER(S)"

    # Renders a Reddit post and its comments as a Markdown string.
    #
    # This is the primary entry point for the class. It instantiates a renderer
    # and calls {#render}.
    #
    # @param post_data [Hash] the +data+ object from Reddit's post listing JSON,
    #   containing keys such as +"title"+, +"author"+, +"selftext"+, +"ups"+,
    #   +"locked"+, +"created_utc"+, and +"subreddit_name_prefixed"+
    # @param replies_data [Array<Hash>] the +children+ array from Reddit's
    #   comment listing JSON; each element represents a top-level comment
    # @param filters [Hash] optional comment filters (see {RedditPostToMarkdown.convert}
    #   for full key documentation)
    # @return [String] the fully rendered Markdown
    def self.render(post_data, replies_data, filters: {})
      new(post_data, replies_data, filters).render
    end

    # @param post_data [Hash] Reddit post data hash (see {.render})
    # @param replies_data [Array<Hash>] top-level comment objects (see {.render})
    # @param filters [Hash] optional comment filters (see {.render})
    def initialize(post_data, replies_data, filters = {})
      @post_data = post_data
      @replies_data = replies_data
      @filters = filters || {}
    end

    # Renders the post and all its comments as a single Markdown string.
    #
    # Sections in order:
    # 1. Post header (subreddit, author, upvotes, timestamp)
    # 2. Post title as an H2
    # 3. Link back to the original post
    # 4. Lock notice (if the thread is locked)
    # 5. Post body / selftext as a block-quote (if present)
    # 6. Gallery images (if the post has +media_metadata+)
    # 7. Total reply count
    # 8. Horizontal rule
    # 9. Comment tree, depth-indented with tab characters
    #
    # @return [String]
    def render
      lines = []

      # Post header
      lines << header_line
      lines << "## #{post_title}"
      lines << "Original post: [#{post_url}](#{post_url})"
      lines << lock_message if post_locked?

      # Selftext
      if post_selftext && !post_selftext.strip.empty?
        decoded = decode_selftext(post_selftext)
        lines << "> #{decoded.gsub("\n", "\n> ")}"
      end

      image_urls = post_image_urls
      unless image_urls.empty?
        lines << "### Images"
        image_urls.each do |url|
          # BUG FIX: the original appended an empty string here, silently
          # dropping every collected image URL. Emit a Markdown image link
          # instead so the images actually appear in the output.
          lines << "![#{url}](#{url})"
        end
        lines << ""
      end

      # Reply count + separator
      lines << "💬 ~ #{count_all_replies} replies"
      lines << "---\n"

      # Top-level comments
      @replies_data.each do |reply_obj|
        render_top_level_reply(reply_obj, lines)
      end

      lines.join("\n")
    end

    private

    def post_title
      @post_data.fetch("title", "Untitled")
    end

    def post_author
      @post_data.fetch("author", "[unknown]")
    end

    def post_subreddit
      @post_data.fetch("subreddit_name_prefixed", "")
    end

    def post_ups
      @post_data.fetch("ups", 0)
    end

    def post_locked?
      @post_data.fetch("locked", false)
    end

    def post_selftext
      @post_data.fetch("selftext", "")
    end

    def post_url
      @post_data.fetch("url", "")
    end

    def post_created_utc
      @post_data["created_utc"]
    end

    # Extracts direct image URLs from the post's +media_metadata+ hash.
    #
    # @return [Array<String>] direct image URLs, possibly empty
    def post_image_urls
      image_urls = []
      media_metadata = @post_data["media_metadata"]
      # a hash of hashes with some sort of hashed keys we don't care about
      return image_urls unless media_metadata

      media_metadata.each do |_hashed_key, metadata_hash|
        next unless metadata_hash["e"] == "Image" || metadata_hash["e"] == "AnimatedImage"

        src = metadata_hash["s"]
        next unless src

        url = src["u"] || src["gif"] || src["mp4"]
        next unless url

        # Reddit's JSON HTML-encodes "&" in query strings; decode it, strip
        # the signed params, and rewrite preview.redd.it -> i.redd.it so the
        # URL serves the image directly instead of an HTML wrapper page.
        url = url.gsub("&amp;", "&").split("?")[0].sub("/preview.", "/i.")
        image_urls << url
      end
      image_urls
    end

    def header_line
      upvotes = format_upvotes(post_ups)
      ts = format_timestamp(post_created_utc)
      ts_str = ts ? "_( #{ts} )_" : ""
      "**#{post_subreddit}** | Posted by u/#{post_author} #{upvotes} #{ts_str}"
    end

    def lock_message
      "---\n\n>🔒 **This thread has been locked by the moderators of #{post_subreddit}**.\n New comments cannot be posted\n\n"
    end

    # Formats an upvote count, abbreviating thousands (e.g. 1500 -> "1k").
    def format_upvotes(ups)
      return "" if ups.nil?

      ups >= 1000 ? "⬆️ #{ups / 1000}k" : "⬆️ #{ups}"
    end

    # Formats a UTC epoch as "YYYY-MM-DD HH:MM:SS"; nil for missing/zero/bad input.
    def format_timestamp(utc)
      return nil unless utc && utc != 0

      Time.at(utc.to_i).utc.strftime("%Y-%m-%d %H:%M:%S")
    rescue StandardError
      # Narrowed from a bare rescue; malformed timestamps simply render blank.
      nil
    end

    # Decodes the HTML entities Reddit embeds in selftext. The original
    # entity strings were lost to HTML rendering (they appeared as no-op
    # gsubs); restored here to the standard named entities.
    def decode_selftext(text)
      text
        .gsub("&amp;", "&")
        .gsub("&lt;", "<")
        .gsub("&gt;", ">")
        .gsub("&quot;", '"')
    end

    def apply_filter(author, body, ups)
      return body if @filters.nil? || @filters.empty?

      message = @filters[:message] || DEFAULT_FILTERED_MESSAGE
      keywords = Array(@filters[:keywords])
      authors = Array(@filters[:authors])
      min_ups = @filters[:min_upvotes] || 0
      regexes = Array(@filters[:regexes])

      keywords.each do |kw|
        return message if body.downcase.include?(kw.to_s.downcase)
      end

      return message if authors.include?(author)
      return message if ups < min_ups

      regexes.each do |regex|
        return message if regex.match?(body)
      end

      body
    end

    # Decodes entities in top-level comment bodies (restored, see
    # decode_selftext) and drops carriage returns.
    def decode_body(text)
      text
        .gsub("&gt;", ">")
        .gsub("\r", "")
    end

    # Decodes entities in nested comment bodies and strips Reddit's "^^"
    # superscript markers that precede links.
    def decode_child_body(text)
      text
        .gsub("&gt;", ">")
        .gsub("&#32;", " ")
        .gsub("^^[", "[")
        .gsub("^^(", "(")
    end

    # Turns bare "u/name" mentions into Markdown profile links.
    def linkify_mentions(text)
      text.gsub(%r{u/(\w+)}) { "[u/#{$1}](https://www.reddit.com/user/#{$1})" }
    end

    def author_link(author)
      return author if author.nil? || author == "[deleted]" || author.empty?

      "[#{author}](https://www.reddit.com/user/#{author})"
    end

    # Author link, tagged "(OP)" when the commenter is the post author.
    def author_field(author)
      field = author_link(author)
      field += " (OP)" if author == post_author && author != "[deleted]" && !author.empty?
      field
    end

    # Counts top-level replies plus every collected descendant.
    def count_all_replies
      total = @replies_data.length
      @replies_data.each do |reply_obj|
        total += get_replies(reply_obj).length
      end
      total
    end

    # Recursively collects all child replies into a flat ordered hash.
    #
    # Traverses the Reddit comment tree depth-first and returns every
    # descendant comment keyed by its Reddit comment ID. Comments with empty
    # or whitespace-only bodies are skipped. Comments deeper than +max_depth+
    # are skipped unless +max_depth+ is +-1+ (unlimited).
    #
    # @param reply_data [Hash] a Reddit comment object containing a nested
    #   +"replies"+ structure
    # @param max_depth [Integer] maximum comment depth to collect;
    #   +-1+ means no limit
    # @param collected [Hash] accumulator used during recursion; callers
    #   should omit this argument
    # @return [Hash{String => Hash}] a hash of
    #   +id => { depth: Integer, child_reply: Hash }+ in depth-first order
    def get_replies(reply_data, max_depth: -1, collected: {})
      replies_obj = reply_data.dig("data", "replies")
      return collected unless replies_obj.is_a?(Hash)

      children = replies_obj.dig("data", "children") || []
      children.each do |child|
        child_data = child.fetch("data", {})
        child_id = child_data["id"]
        child_depth = child_data.fetch("depth", 0)
        child_body = child_data.fetch("body", "")

        next if max_depth != -1 && child_depth > max_depth
        next if child_body.strip.empty?

        collected[child_id] = { depth: child_depth, child_reply: child }
        get_replies(child, max_depth: max_depth, collected: collected)
      end

      collected
    end

    # Renders one top-level comment (and its subtree) into +lines+.
    # AutoModerator and authorless comments are skipped entirely.
    def render_top_level_reply(reply_obj, lines)
      data = reply_obj.fetch("data", {})
      author = data.fetch("author", "")

      return if author.empty?
      return if author == "AutoModerator"

      ups = data.fetch("ups", 0)
      upvotes = format_upvotes(ups)
      ts = format_timestamp(data["created_utc"])
      ts_str = ts ? "_( #{ts} )_" : ""
      af = author_field(author)

      lines << "* **#{af}** #{upvotes} #{ts_str}\n\n"

      body = data.fetch("body", "")
      return if body.strip.empty?

      if body == "[deleted]"
        lines << "\tComment deleted by user\n\n"
      else
        filtered = apply_filter(author, body, ups)
        formatted = decode_body(filtered)
        formatted = linkify_mentions(formatted)
        formatted = formatted.gsub("\n", "\n\t")
        lines << "\t#{formatted}\n\n"
      end

      # Nested replies
      child_map = get_replies(reply_obj)
      child_map.each_value do |info|
        render_child_reply(info, lines)
      end
    end

    # Renders one nested comment, indented one tab per depth level.
    def render_child_reply(info, lines)
      cdepth = info[:depth]
      child_data = info[:child_reply].fetch("data", {})
      author = child_data.fetch("author", "")
      ups = child_data.fetch("ups", 0)
      body = child_data.fetch("body", "")

      upvotes = format_upvotes(ups)
      ts = format_timestamp(child_data["created_utc"])
      ts_str = ts ? "_( #{ts} )_" : ""
      af = author_field(author)
      indent = "\t" * cdepth

      lines << "#{indent}* **#{af}** #{upvotes} #{ts_str}\n\n"

      return if body.strip.empty?

      if body == "[deleted]"
        lines << "#{indent}\tComment deleted by user\n\n"
      else
        filtered = apply_filter(author, body, ups)
        formatted = decode_child_body(filtered)
        formatted = linkify_mentions(formatted)
        formatted = formatted.gsub("\n", "\n#{indent}\t")
        lines << "#{indent}\t#{formatted}\n\n"
      end
    end
  end
end
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
require "httparty"
require_relative "version"

module RedditPostToMarkdown
  # Fetches Reddit post JSON via the public Reddit API.
  #
  # Requests are made without authentication using Reddit's +.json+ endpoint,
  # which works for any public post.
  class RedditClient
    include HTTParty

    USER_AGENT = "RedditPostToMarkdown/#{VERSION} (Safe Download Bot)"

    # Downloads the JSON data for a Reddit post URL.
    #
    # The +.json+ suffix is appended when missing, then a GET request is
    # issued. Reddit answers with a two-element array: the post data first,
    # the top-level comments second.
    #
    # @param url [String] a cleaned Reddit post URL (no trailing slash,
    #   no query parameters)
    # @return [Array] the parsed two-element JSON response from Reddit
    # @raise [FetchError] if the server returns a non-2xx HTTP status
    # @raise [InvalidResponseError] if the parsed response is not a
    #   two-element Array
    def fetch_post(url)
      target = url.end_with?(".json") ? url : "#{url}.json"

      response = self.class.get(target, headers: { "User-Agent" => USER_AGENT })

      unless response.success?
        raise FetchError, "HTTP #{response.code} fetching #{url}"
      end

      parsed = response.parsed_response

      unless parsed.is_a?(Array) && parsed.length >= 2
        raise InvalidResponseError, "Expected a 2-element JSON array from #{url}"
      end

      parsed
    end
  end
end
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
module RedditPostToMarkdown
  # Validates and normalises Reddit post URLs.
  class UrlValidator
    PATTERNS = [
      %r{\Ahttps://(?:www\.)?reddit\.com/r/[^/]+/comments/[a-z0-9]+/},
      %r{\Ahttps://(?:www\.)?reddit\.com/[^/]+/comments/[a-z0-9]+/},
      %r{\Ahttps://(?:old\.)?reddit\.com/r/[^/]+/comments/[a-z0-9]+/},
      %r{\Ahttps://redd\.it/[a-z0-9]+}
    ].freeze

    # Returns +true+ if +url+ looks like a direct Reddit post URL.
    #
    # A valid post URL must use HTTPS and match one of the following forms:
    # - +https://www.reddit.com/r/<sub>/comments/<id>/+
    # - +https://reddit.com/r/<sub>/comments/<id>/+
    # - +https://old.reddit.com/r/<sub>/comments/<id>/+
    # - +https://redd.it/<id>+
    #
    # Subreddit listings, user profiles, search pages, and similar URLs return
    # +false+.
    #
    # @param url [String, nil] the URL to check
    # @return [Boolean]
    def self.valid_post_url?(url)
      return false if url.nil? || url.empty?
      return false unless url.start_with?("https://")

      PATTERNS.any? { |pattern| url.match?(pattern) }
    end

    # Strips any query string and the trailing slash from a Reddit URL.
    #
    # Generalized from the original, which only removed query strings
    # beginning with +?utm_source+, +?ref=+, or +?context=+ — any query
    # string is now dropped. Leading/trailing whitespace is also removed.
    # Also fixes a crash: the original raised NoMethodError on an empty
    # string (+"".split(...).first+ is +nil+, then +nil.split+ blew up).
    #
    # @param url [String] the URL to clean
    # @return [String] the cleaned URL
    def self.clean_url(url)
      url = url.to_s.strip
      # .to_s guards the nil that String#split returns for an empty receiver.
      url = url.split("?").first.to_s
      url.chomp("/")
    end
  end
end
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
require_relative "reddit_post_to_markdown/version"
require_relative "reddit_post_to_markdown/errors"
require_relative "reddit_post_to_markdown/url_validator"
require_relative "reddit_post_to_markdown/reddit_client"
require_relative "reddit_post_to_markdown/post_renderer"

# Top-level namespace for the reddit_post_to_markdown gem.
module RedditPostToMarkdown
  # Downloads a public Reddit post and returns it as a Markdown string.
  #
  # The URL must point directly to a single post. Subreddit listings, user
  # profiles, search pages, and similar URLs will raise {NotAPostError}.
  # Posts that require authentication (private subreddits, age-gated content)
  # are not accessible.
  #
  # @example Basic usage
  #   markdown = RedditPostToMarkdown.convert(
  #     "https://www.reddit.com/r/ruby/comments/abc123/some_title/"
  #   )
  #
  # @example Without comments
  #   markdown = RedditPostToMarkdown.convert(url, include_comments: false)
  #
  # @example With comment filters
  #   markdown = RedditPostToMarkdown.convert(
  #     url,
  #     filters: {
  #       keywords: ["spam"],
  #       authors: ["AutoModerator"],
  #       min_upvotes: 5,
  #       regexes: [/buy now/i],
  #       message: "[ removed ]"
  #     }
  #   )
  #
  # @param url [String] the URL of a public Reddit post
  # @param include_comments [Boolean] when +false+, omits all comments and
  #   renders only the post header, title, body, and a reply count of 0.
  #   Defaults to +true+.
  # @param filters [Hash] optional hash to suppress comments matching any
  #   criterion. Filters are evaluated in the order listed below; the first
  #   match replaces the comment body with +:message+. All keys are optional.
  # @option filters [Array<String>] :keywords case-insensitive substrings;
  #   any comment whose body contains one of these strings is replaced
  # @option filters [Array<String>] :authors usernames (exact, case-sensitive
  #   match) whose comments are replaced regardless of content
  # @option filters [Integer] :min_upvotes comments with fewer upvotes than
  #   this value are replaced
  # @option filters [Array<Regexp>] :regexes patterns matched against the
  #   comment body; a match causes the comment to be replaced
  # @option filters [String] :message the replacement text used when any
  #   filter matches (default: +"REMOVED DUE TO CUSTOM FILTER(S)"+)
  # @return [String] the post and its comments rendered as Markdown
  # @raise [NotAPostError] if +url+ does not point to a Reddit post
  # @raise [FetchError] if the HTTP request to Reddit fails
  # @raise [InvalidResponseError] if Reddit returns an unexpected JSON structure
  def self.convert(url, filters: {}, include_comments: true)
    clean = UrlValidator.clean_url(url)
    raise NotAPostError, "Not a Reddit post URL: #{url}" unless UrlValidator.valid_post_url?(clean)

    data = RedditClient.new.fetch_post(clean)

    post_info = data.dig(0, "data", "children")
    if post_info.nil? || post_info.empty?
      raise InvalidResponseError, "No post data found in response"
    end

    post_data = post_info[0].fetch("data", {})
    replies_data = []
    replies_data = data.dig(1, "data", "children") || [] if include_comments

    PostRenderer.render(post_data, replies_data, filters: filters)
  end
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
require_relative "lib/reddit_post_to_markdown/version"

Gem::Specification.new do |spec|
  spec.name    = "reddit_post_to_markdown"
  spec.version = RedditPostToMarkdown::VERSION
  spec.authors = ["masukomi"]
  spec.license = "MIT"

  spec.summary     = "Download a public Reddit post and convert it to Markdown"
  spec.description = "Takes the URL of a public Reddit post, downloads the post and its comments via the Reddit JSON API, and returns the content as a Markdown string."

  spec.required_ruby_version = ">= 2.7"

  spec.files = Dir["lib/**/*.rb", "reddit_post_to_markdown.gemspec"]

  # Runtime dependency
  spec.add_dependency "httparty", "~> 0.22"

  # Development-only dependencies
  spec.add_development_dependency "rspec", "~> 3.13"
  spec.add_development_dependency "webmock", "~> 3.26"
end
|
metadata
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: reddit_post_to_markdown
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- masukomi
|
|
8
|
+
bindir: bin
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: httparty
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - "~>"
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '0.22'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - "~>"
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '0.22'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: rspec
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - "~>"
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '3.13'
|
|
33
|
+
type: :development
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - "~>"
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '3.13'
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: webmock
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - "~>"
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '3.26'
|
|
47
|
+
type: :development
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - "~>"
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '3.26'
|
|
54
|
+
description: Takes the URL of a public Reddit post, downloads the post and its comments
|
|
55
|
+
via the Reddit JSON API, and returns the content as a Markdown string.
|
|
56
|
+
executables: []
|
|
57
|
+
extensions: []
|
|
58
|
+
extra_rdoc_files: []
|
|
59
|
+
files:
|
|
60
|
+
- lib/reddit_post_to_markdown.rb
|
|
61
|
+
- lib/reddit_post_to_markdown/errors.rb
|
|
62
|
+
- lib/reddit_post_to_markdown/post_renderer.rb
|
|
63
|
+
- lib/reddit_post_to_markdown/reddit_client.rb
|
|
64
|
+
- lib/reddit_post_to_markdown/url_validator.rb
|
|
65
|
+
- lib/reddit_post_to_markdown/version.rb
|
|
66
|
+
- reddit_post_to_markdown.gemspec
|
|
67
|
+
licenses:
|
|
68
|
+
- MIT
|
|
69
|
+
metadata: {}
|
|
70
|
+
rdoc_options: []
|
|
71
|
+
require_paths:
|
|
72
|
+
- lib
|
|
73
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
74
|
+
requirements:
|
|
75
|
+
- - ">="
|
|
76
|
+
- !ruby/object:Gem::Version
|
|
77
|
+
version: '2.7'
|
|
78
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
79
|
+
requirements:
|
|
80
|
+
- - ">="
|
|
81
|
+
- !ruby/object:Gem::Version
|
|
82
|
+
version: '0'
|
|
83
|
+
requirements: []
|
|
84
|
+
rubygems_version: 4.0.7
|
|
85
|
+
specification_version: 4
|
|
86
|
+
summary: Download a public Reddit post and convert it to Markdown
|
|
87
|
+
test_files: []
|