mddir 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,213 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "http-cookie"
4
+ require "httpx"
5
+ require "nokogiri"
6
+ require "readability"
7
+ require "reverse_markdown"
8
+ require "yaml"
9
+
10
module Mddir
  # Downloads a URL and converts the response into an Entry.
  #
  # Responses served as `text/markdown` are kept as received; every other
  # response is treated as HTML, reduced to its main article with Readability
  # and converted to GitHub-flavoured Markdown with ReverseMarkdown.
  class Fetcher # rubocop:disable Metrics/ClassLength
    # HTTPX timeouts, in seconds.
    CONNECT_TIMEOUT = 15
    READ_TIMEOUT = 30

    # Elements Readability is allowed to keep in the extracted article.
    READABILITY_TAGS = %w[
      div p span
      h1 h2 h3 h4 h5 h6
      pre code
      ul ol li
      table thead tbody tfoot tr th td
      blockquote
      a img br hr
      strong em b i u s del sub sup
      dl dt dd
      figure figcaption
      details summary
    ].freeze

    # Attributes Readability is allowed to keep on those elements.
    READABILITY_ATTRIBUTES = %w[href src alt title lang class id style].freeze

    # Trailing " | Site", " – Site", " — Site" or " - Site" segment of a page
    # title. A plain hyphen only counts as a separator when it has whitespace
    # on both sides, so hyphenated words ("Self-hosting") are left alone.
    TITLE_SUFFIX = /(?:\s*[|–—]\s*|\s+-\s+)(?:(?!\s-\s)[^|–—])+\z/

    # @param config [#user_agent] application configuration
    # @param cookies_path [String, nil] optional cookies.txt file to send cookies from
    def initialize(config, cookies_path: nil)
      @config = config
      @cookie_jar = load_cookies(cookies_path)
      @client = build_client
    end

    # Fetches +url+ and builds an Entry from the response.
    #
    # @param url [String]
    # @return [Entry]
    # @raise [FetchError] on connection errors or an HTTP error status
    def fetch(url)
      response = request(url)
      content_type = response.headers["content-type"].to_s

      if content_type.include?("text/markdown")
        process_markdown_response(url, response)
      else
        process_html_response(url, response)
      end
    end

    private

    # Returns a cookie jar loaded from +path+, or nil when no readable path is given.
    def load_cookies(path)
      return nil unless path && File.exist?(path)

      jar = HTTP::CookieJar.new
      jar.load(path, format: :cookiestxt, session: true)
      jar
    end

    def build_client
      HTTPX.plugin(:follow_redirects)
           .with(
             headers: {
               "accept" => "text/markdown, text/html",
               "user-agent" => @config.user_agent
             },
             timeout: { connect_timeout: CONNECT_TIMEOUT, read_timeout: READ_TIMEOUT }
           )
    end

    # Performs the GET request and converts failures into FetchError.
    def request(url)
      response = @client.get(url, headers: cookie_headers(url))
      raise FetchError, response.error.message if response.is_a?(HTTPX::ErrorResponse)
      # A 4xx/5xx body is an error page, not the document that was asked for.
      raise FetchError, "HTTP #{response.status} for #{url}" if response.status >= 400

      response
    end

    # Builds the "cookie" request header for +url+ from the loaded jar, if any.
    def cookie_headers(url)
      return {} unless @cookie_jar

      uri = URI.parse(url)
      cookie_value = HTTP::Cookie.cookie_value(@cookie_jar.cookies(uri))
      cookie_value.empty? ? {} : { "cookie" => cookie_value }
    end

    # Returns a valid UTF-8 copy of +body+, interpreting it in the charset
    # announced by +content_type+ (UTF-8 when none or an unknown one is given).
    def normalize_encoding(body, content_type)
      text = body.dup
      text.force_encoding(detect_encoding(content_type))
      # String#encode is a no-op when source and target encodings match, so
      # invalid bytes in UTF-8 input have to be replaced with #scrub instead.
      return text.scrub if text.encoding == Encoding::UTF_8

      text.encode("UTF-8", invalid: :replace, undef: :replace)
    rescue Encoding::ConverterNotFoundError
      text.force_encoding(Encoding::UTF_8).scrub
    end

    # Extracts the charset from a Content-Type value; tolerates quoted values
    # (charset="utf-8") and falls back to UTF-8 for unknown labels.
    def detect_encoding(content_type)
      label = content_type.to_s[/charset\s*=\s*["']?([^\s;"']+)/i, 1]
      label ? Encoding.find(label) : Encoding::UTF_8
    rescue ArgumentError
      Encoding::UTF_8
    end

    def process_markdown_response(url, response)
      body = normalize_encoding(response.body.to_s, response.headers["content-type"])
      frontmatter, content = parse_frontmatter(body)
      token_count, token_estimated = resolve_token_count(content, response.headers["x-markdown-tokens"])

      # NOTE(review): the full body (frontmatter included) is stored as the
      # markdown, while only the frontmatter-free content is used for the
      # token estimate - confirm this is what Entry expects.
      Entry.new(
        url:,
        title: frontmatter["title"].to_s,
        description: frontmatter["description"].to_s,
        markdown: body,
        conversion: "cloudflare",
        token_count:,
        token_estimated:
      )
    end

    # Splits a document into its YAML frontmatter and the remaining content.
    #
    # The delimiters must be `---` lines of their own, so a `---` inside a
    # value does not end the frontmatter. Unparseable or non-mapping YAML is
    # treated as "no frontmatter".
    #
    # @return [Array(Hash, String)] frontmatter hash and content
    def parse_frontmatter(body)
      match = body.match(/\A---[ \t]*\r?\n(?<yaml>.*?)^---[ \t]*\r?(?:\n|\z)/m)
      return [{}, body] unless match

      frontmatter = YAML.safe_load(match[:yaml], permitted_classes: [Time]) || {}
      return [{}, body] unless frontmatter.is_a?(Hash)

      [frontmatter, match.post_match.lstrip]
    rescue Psych::Exception
      [{}, body]
    end

    # Uses the server-reported token count when it is a valid non-negative
    # integer, otherwise estimates it from the content length.
    #
    # @return [Array(Integer, Boolean)] count and whether it is an estimate
    def resolve_token_count(content, header)
      reported = Integer(header.to_s.strip, 10, exception: false)
      return [reported, false] if reported && !reported.negative?

      [estimate_tokens(content), true]
    end

    # Rough token estimate: one token per four characters, rounded up.
    def estimate_tokens(text)
      (text.length / 4.0).ceil
    end

    def process_html_response(url, response)
      html = normalize_encoding(response.body.to_s, response.headers["content-type"])
      document = Nokogiri::HTML(html)
      title, article_html = extract_readable_content(html, document)
      markdown = html_to_markdown(article_html)

      Entry.new(
        url:,
        title:,
        description: extract_description(document),
        markdown:,
        conversion: "local",
        token_count: estimate_tokens(markdown),
        token_estimated: true
      )
    end

    # Returns [title, article_html], falling back to the whole <body> and the
    # <title> tag when Readability yields nothing.
    def extract_readable_content(html, document)
      title, article_html = run_readability(html)

      if article_html.nil? || article_html.strip.empty?
        warn "Warning: readability extracted no content, falling back to full body"
        article_html = document.at("body")&.inner_html.to_s
      end

      title = extract_title(document) if title.empty?

      [clean_title(title), article_html]
    end

    # Best effort: any Readability failure is reported as "no content" so the
    # caller can fall back to the full body.
    def run_readability(html)
      readable = Readability::Document.new(html, tags: READABILITY_TAGS, attributes: READABILITY_ATTRIBUTES)
      [readable.title.to_s, readable.content]
    rescue StandardError
      ["", nil]
    end

    def html_to_markdown(article_html)
      article_html = article_html.encode("UTF-8", invalid: :replace, undef: :replace)
      code_languages = extract_code_languages(article_html)
      markdown = ReverseMarkdown.convert(article_html, github_flavored: true).force_encoding("UTF-8")
      inject_code_languages(markdown, code_languages)
    end

    # Returns one language (or nil) per <pre> element, in document order.
    def extract_code_languages(html)
      Nokogiri::HTML.fragment(html).css("pre").map do |pre|
        pre["lang"] ||
          pre["data-lang"] ||
          pre.css("code").first&.[]("class").to_s[/language-(\w+)/, 1]
      end
    end

    # Adds the collected languages to the opening fences of the converted
    # Markdown. Fences are counted in order: even positions open a block, odd
    # positions close it.
    def inject_code_languages(markdown, languages)
      fence_count = 0

      # [ \t]* rather than \s*: \s also matches newlines and would swallow a
      # blank first line of the code block.
      markdown.gsub(/^```[ \t]*$/) do |fence|
        language = languages[fence_count / 2] if fence_count.even?
        fence_count += 1
        language ? "```#{language}" : fence
      end
    end

    def extract_description(document)
      meta = document.at('meta[name="description"]')
      meta ? meta["content"].to_s : ""
    end

    def extract_title(document)
      title_tag = document.at("title")
      title_tag ? title_tag.text.to_s.strip : ""
    end

    # Removes a trailing site-name segment ("Page | Site" -> "Page"). Keeps the
    # original title when stripping would leave nothing.
    def clean_title(title)
      cleaned = title.sub(TITLE_SUFFIX, "").strip
      cleaned.empty? ? title.strip : cleaned
    end
  end

  # Raised when a URL cannot be fetched.
  class FetchError < StandardError; end
end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "fileutils"
4
+ require "yaml"
5
+
6
module Mddir
  # Reads and rebuilds `<base_dir>/index.yml`, a summary of every collection
  # (entry count and last-added value) plus the overall entry total.
  module GlobalIndex
    # UTC timestamp layout, e.g. "2024-05-01T12:34:56Z". Formatting with
    # strftime avoids relying on the `time` library being loaded, which this
    # file does not require.
    TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

    # @return [String] location of the global index file
    def self.path(config)
      File.join(config.base_dir, "index.yml")
    end

    # Loads the index, rebuilding it when the file is missing, is not a
    # mapping, or cannot be parsed safely.
    #
    # @return [Hash]
    def self.load(config)
      file = path(config)
      return update!(config) unless File.exist?(file)

      data = YAML.safe_load_file(file, permitted_classes: [Time])
      data.is_a?(Hash) ? data : update!(config)
    rescue Psych::Exception
      # Covers syntax errors as well as disallowed classes and aliases.
      update!(config)
    end

    # Rebuilds the index from the collections on disk and writes it out.
    #
    # @return [Hash] the data that was written
    def self.update!(config)
      FileUtils.mkdir_p(config.base_dir)

      collections = build_collections(config)
      data = {
        "collections" => collections,
        "total_entries" => collections.sum { |_name, info| info["entry_count"] },
        "last_updated" => Time.now.utc.strftime(TIMESTAMP_FORMAT)
      }

      File.write(path(config), YAML.dump(data))
      data
    end

    # Maps each collection name to its summary hash.
    def self.build_collections(config)
      Collection.all(config).to_h do |collection|
        summary = {
          "entry_count" => collection.entry_count,
          "last_added" => collection.last_added&.to_s
        }
        [collection.name, summary]
      end
    end

    private_class_method :build_collections
  end
end
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
module Mddir
  # Runs a text query against the search index and groups the matching lines
  # by collection entry.
  class Search
    # One matching entry with all of its matching lines.
    Result = Struct.new(:collection_name, :entry, :matches)
    # One matching line: its line number and a display snippet.
    Match = Struct.new(:line_number, :snippet)

    # Characters of context kept before and after the hit in a snippet.
    SNIPPET_LEAD = 40
    SNIPPET_TAIL = 80
    # Snippet length used when the query cannot be located in the line.
    SNIPPET_FALLBACK = 120

    def initialize(config)
      @config = config
    end

    # @param query [String] text to search for
    # @param collection_name [String, nil] restrict the search to one collection
    # @return [Array<Result>] empty when no matching collection exists
    def search(query, collection_name: nil)
      collections = resolve_collections(collection_name)
      return [] if collections.empty?

      SearchIndex.open(@config) do |index|
        collections.each { |collection| index.ensure_current!(collection) }

        rows = index.query(query, collection_names: collections.map(&:name))
        build_results(collections, rows, query)
      end
    end

    private

    # Returns the named collection (if it exists) or every collection.
    def resolve_collections(collection_name)
      return Collection.all(@config) unless collection_name

      collection = Collection.new(collection_name, @config)
      collection.exist? ? [collection] : []
    end

    # Groups index rows per file and pairs them with the entry they belong
    # to; rows whose entry is no longer listed are dropped.
    def build_results(collections, rows, query)
      entries_lookup = build_entries_lookup(collections)
      grouped = rows.group_by { |row| [row["collection"], row["filename"]] }

      grouped.filter_map do |(collection_name, filename), file_rows|
        entry = entries_lookup.dig(collection_name, filename)
        next unless entry

        Result.new(collection_name:, entry:, matches: build_matches(file_rows, query))
      end
    end

    # { collection name => { filename => entry } }
    def build_entries_lookup(collections)
      collections.to_h do |collection|
        [collection.name, collection.entries.to_h { |entry| [entry["filename"], entry] }]
      end
    end

    def build_matches(file_rows, query)
      file_rows.map do |row|
        snippet = extract_snippet(row["content"], query)
        Match.new(line_number: row["line_number"].to_i, snippet: snippet)
      end
    end

    # Cuts a window around the first case-insensitive occurrence of +query+,
    # marking truncated ends with "...".
    #
    # The hit is located on the line itself with a case-insensitive pattern.
    # Searching a downcased copy gives wrong offsets whenever downcasing
    # changes the number of characters.
    def extract_snippet(line, query)
      line = line.strip
      hit = line.match(/#{Regexp.escape(query)}/i)
      return line[0, SNIPPET_FALLBACK] unless hit

      start = [hit.begin(0) - SNIPPET_LEAD, 0].max
      finish = [hit.end(0) + SNIPPET_TAIL, line.length].min
      snippet = line[start...finish]
      snippet = "...#{snippet}" if start.positive?
      snippet = "#{snippet}..." if finish < line.length
      snippet
    end
  end
end
@@ -0,0 +1,107 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "sqlite3"
4
+
5
module Mddir
  # SQLite FTS5 index over the lines of every entry, stored in
  # `<base_dir>/search.db`. Each collection is re-indexed when its index file
  # is newer than the timestamp recorded in the `meta` table.
  class SearchIndex
    # The trigram tokenizer cannot answer full-text queries shorter than
    # three characters; those are served with LIKE instead.
    TRIGRAM_MIN_LENGTH = 3

    SELECT_LINES = "SELECT collection, filename, line_number, content FROM search_lines"

    # Opens the index, yields it, and always closes it afterwards.
    #
    # @return the block's result
    def self.open(config)
      index = new(config)
      yield index
    ensure
      index&.close
    end

    def initialize(config)
      @db = SQLite3::Database.new(File.join(config.base_dir, "search.db"))
      @db.results_as_hash = true
      setup_schema
    end

    # Re-indexes +collection+ unless the stored timestamp is at least as new
    # as the collection's index file.
    def ensure_current!(collection)
      row = @db.get_first_row("SELECT indexed_at FROM meta WHERE collection = ?", collection.name)
      return if row && row["indexed_at"] >= index_mtime(collection)

      reindex(collection)
    end

    # Finds lines containing +text+ (case-insensitive substring match).
    #
    # @param text [String] literal text to look for
    # @param collection_names [Array<String>] collections to search
    # @return [Array<Hash>] rows with collection, filename, line_number, content
    def query(text, collection_names:)
      placeholders = (["?"] * collection_names.size).join(", ")
      return short_query(text, collection_names, placeholders) if text.length < TRIGRAM_MIN_LENGTH

      phrase = "\"#{text.gsub('"', '""')}\""
      @db.execute(
        "#{SELECT_LINES} WHERE search_lines MATCH ? AND collection IN (#{placeholders}) ORDER BY rank",
        [phrase, *collection_names]
      )
    end

    def remove_collection!(collection_name)
      @db.execute("DELETE FROM search_lines WHERE collection = ?", collection_name)
      @db.execute("DELETE FROM meta WHERE collection = ?", collection_name)
    end

    def close
      @db.close
    end

    private

    # Substring search for queries too short for the trigram index. LIKE
    # wildcards in the text are escaped so they match literally.
    def short_query(text, collection_names, placeholders)
      literal = text.gsub(/[\\%_]/) { |char| "\\#{char}" }
      @db.execute(
        "#{SELECT_LINES} WHERE content LIKE ? ESCAPE '\\' AND collection IN (#{placeholders}) ORDER BY rowid",
        ["%#{literal}%", *collection_names]
      )
    end

    def setup_schema
      @db.execute_batch(<<~SQL)
        CREATE VIRTUAL TABLE IF NOT EXISTS search_lines USING fts5(
          collection UNINDEXED,
          filename UNINDEXED,
          line_number UNINDEXED,
          content,
          tokenize='trigram case_sensitive 0'
        );

        CREATE TABLE IF NOT EXISTS meta (
          collection TEXT PRIMARY KEY,
          indexed_at REAL NOT NULL
        );
      SQL
    end

    # Replaces every indexed line of +collection+ in a single transaction.
    def reindex(collection)
      @db.transaction do
        @db.execute("DELETE FROM search_lines WHERE collection = ?", collection.name)
        index_collection_files(collection)
        update_meta(collection)
      end
    end

    def index_collection_files(collection)
      collection.entries.each do |entry|
        filename = entry["filename"]
        file_path = File.join(collection.path, filename)
        index_file(collection.name, filename, file_path) if File.exist?(file_path)
      end
    end

    def update_meta(collection)
      @db.execute(
        "INSERT OR REPLACE INTO meta (collection, indexed_at) VALUES (?, ?)",
        [collection.name, index_mtime(collection)]
      )
    end

    # Indexes every non-blank line after the frontmatter, keeping the line
    # numbers of the original file.
    def index_file(collection_name, filename, file_path)
      lines = File.readlines(file_path, encoding: "UTF-8")

      Utils.skip_frontmatter(lines).each do |line_number, line|
        next if line.strip.empty?

        @db.execute(
          "INSERT INTO search_lines (collection, filename, line_number, content) VALUES (?, ?, ?, ?)",
          [collection_name, filename, line_number, line]
        )
      end
    end

    # Modification time of the collection's index file as a Float, or 0.0
    # when the file does not exist.
    def index_mtime(collection)
      File.exist?(collection.index_path) ? File.mtime(collection.index_path).to_f : 0.0
    end
  end
end
@@ -0,0 +1,152 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "sinatra/base"
4
+ require "kramdown"
5
+ require "kramdown-parser-gfm"
6
+ require "rouge"
7
+ require "uri"
8
+
9
module Mddir
  # Sinatra application for browsing, searching and deleting saved entries.
  class Server < Sinatra::Base # rubocop:disable Metrics/ClassLength
    set :views, File.expand_path("../../views", __dir__)
    set :public_folder, File.expand_path("../../public", __dir__)

    # Lets HTML forms issue DELETE requests through the `_method` parameter.
    enable :method_override

    before do
      @collection_names = Collection.all(config).map(&:name)
    end

    # Stores the configuration in the app settings and runs the server bound
    # to localhost on the configured port.
    def self.start(config)
      set :mddir_config, config
      set :port, config.port
      set :bind, "localhost"

      puts "mddir server running at http://localhost:#{config.port}"
      puts "Press Ctrl+C to stop"
      run!
    end

    helpers do # rubocop:disable Metrics/BlockLength
      def config
        settings.mddir_config
      end

      # Formats a date-like value as "Jan 02, 2024"; returns the input as a
      # string when it cannot be parsed.
      def format_date(date_str)
        return "" unless date_str

        Time.parse(date_str.to_s).strftime("%b %d, %Y")
      rescue ArgumentError
        date_str.to_s
      end

      # Host part of +url+, or the input itself when it is not a valid URI.
      def domain_from_url(url)
        URI.parse(url).host
      rescue URI::InvalidURIError
        url
      end

      def truncate(text, length = 200)
        return "" unless text

        text.length > length ? "#{text[0, length]}..." : text
      end

      # HTML-escapes +text+.
      def h(text)
        Rack::Utils.escape_html(text.to_s)
      end

      def format_tokens(count)
        return "" unless count

        count >= 1000 ? "~#{(count / 1000.0).round(1)}k tokens" : "~#{count} tokens"
      end

      # Returns +text+ as escaped HTML with every case-insensitive occurrence
      # of +query+ wrapped in <mark>.
      #
      # The raw text is split on the query first and each piece is escaped on
      # its own. Matching against already-escaped HTML would break entities
      # (a query of "amp" hitting "&amp;") and miss queries containing
      # characters that get escaped.
      def highlight(text, query)
        return h(text) unless query && !query.empty?

        # With a capture group, split returns the matches at odd positions.
        pieces = text.to_s.split(/(#{Regexp.escape(query)})/i, -1)
        marked = pieces.each_with_index.map do |piece, position|
          position.odd? ? "<mark>#{h(piece)}</mark>" : h(piece)
        end
        marked.join
      end
    end

    get "/" do
      @global = GlobalIndex.load(config)
      @collections = (@global["collections"] || {}).sort_by { |name, _| name }

      erb :home
    end

    get "/search" do
      @query = params["q"].to_s.strip
      # A blank filter (e.g. an empty form field) means "all collections".
      filter = params["collection"].to_s.strip
      @collection_filter = filter.empty? ? nil : filter

      @results =
        if @query.empty?
          []
        else
          Search.new(config).search(@query, collection_name: @collection_filter)
        end

      erb :search
    end

    get "/:collection" do
      collection = Collection.new(params[:collection], config)
      halt 404, "Collection not found" unless collection.exist?

      @collection = collection
      @current_collection = collection.name
      @entries = collection.entries.reverse

      erb :collection
    end

    get "/:collection/:slug" do
      collection = Collection.new(params[:collection], config)
      halt 404, "Collection not found" unless collection.exist?

      @collection = collection
      @current_collection = collection.name
      @entry = collection.entries.find { |entry| entry["slug"] == params[:slug] }
      halt 404, "Entry not found" unless @entry

      file_path = File.join(collection.path, @entry["filename"])
      halt 404, "File not found" unless File.exist?(file_path)

      raw = File.read(file_path, encoding: "UTF-8")
      content = Utils.strip_frontmatter(raw)
      # NOTE(review): the rendered HTML comes from fetched pages - confirm the
      # reader view treats it accordingly.
      @html_content = Kramdown::Document.new(
        content,
        input: "GFM",
        syntax_highlighter: :rouge,
        syntax_highlighter_opts: {
          default_lang: "plaintext"
        }
      ).to_html

      erb :reader
    end

    delete "/:collection" do
      collection = Collection.new(params[:collection], config)
      halt 404, "Collection not found" unless collection.exist?

      collection.remove!

      redirect "/"
    end

    delete "/:collection/:slug" do
      collection = Collection.new(params[:collection], config)
      halt 404, "Collection not found" unless collection.exist?

      entry = collection.entries.find { |e| e["slug"] == params[:slug] }
      halt 404, "Entry not found" unless entry

      collection.remove_entry(entry["slug"])

      redirect "/#{collection.name}"
    end
  end
end
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
module Mddir
  # Stateless text helpers shared by the rest of the gem.
  module Utils
    # A leading YAML frontmatter block: an opening `---` line, the shortest
    # run of text, and a closing `---` line. Both delimiters must be lines of
    # their own, so a `---` inside a value or URL is not a delimiter.
    FRONTMATTER = /\A---[ \t]*\r?\n.*?^---[ \t]*\r?(?:\n|\z)/m

    # Lower-cases +text+ and joins its ASCII alphanumeric runs with single
    # hyphens ("Hello, World!" -> "hello-world").
    #
    # @param text [String]
    # @return [String]
    def self.slugify(text)
      # The first pass already collapses every separator run into one hyphen,
      # so only a single leading/trailing hyphen can remain.
      text.downcase.gsub(/[^a-z0-9]+/, "-").delete_prefix("-").delete_suffix("-")
    end

    # Returns +text+ without its leading frontmatter block; text with no
    # (or unterminated) frontmatter is returned unchanged.
    #
    # @param text [String]
    # @return [String]
    def self.strip_frontmatter(text)
      match = FRONTMATTER.match(text)
      match ? match.post_match.lstrip : text
    end

    # Pairs every line after the frontmatter with its 1-based line number in
    # the original file, with line endings removed.
    #
    # Unterminated frontmatter is treated as ordinary content, which matches
    # strip_frontmatter, instead of discarding the whole file.
    #
    # @param lines [Array<String>]
    # @return [Array<Array(Integer, String)>]
    def self.skip_frontmatter(lines)
      first_body_index = 0

      if lines.first&.strip == "---"
        closing = (1...lines.length).find { |index| lines[index].strip == "---" }
        first_body_index = closing + 1 if closing
      end

      lines.each_with_index.drop(first_body_index).map do |line, index|
        [index + 1, line.chomp]
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
# Namespace of the mddir gem.
module Mddir
  # Version string of this release.
  VERSION = "0.1.0"
end
data/lib/mddir.rb ADDED
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "mddir/version"
4
+ require_relative "mddir/utils"
5
+ require_relative "mddir/config"
6
+ require_relative "mddir/global_index"
7
+ require_relative "mddir/collection"
8
+ require_relative "mddir/entry"
9
+ require_relative "mddir/fetcher"
10
+ require_relative "mddir/search_index"
11
+ require_relative "mddir/search"
12
+ require_relative "mddir/cli"
13
+
14
# Top-level namespace of the mddir gem.
module Mddir
  # Error class declared at the gem's top level; it inherits from StandardError.
  Error = Class.new(StandardError)
end