openstax_content 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md ADDED
@@ -0,0 +1,2 @@
1
+ # content-ruby
2
+ Ruby bindings to read and parse the OpenStax ABL and the content archive
@@ -0,0 +1,14 @@
1
module OpenStax
  # Namespace for the gem. Its singleton holds the gem-wide settings as plain
  # read/write attributes; nothing here validates or defaults them.
  module Content
    class << self
      # Location settings
      attr_accessor :abl_url
      attr_accessor :archive_path
      attr_accessor :domain
      attr_accessor :exercises_search_api_url

      # S3 settings
      attr_accessor :bucket_name
      attr_accessor :s3_region
      attr_accessor :s3_access_key_id
      attr_accessor :s3_secret_access_key

      # Logger-like object used by the content classes
      attr_accessor :logger

      # Yields this module so the caller can assign the settings above.
      # Returns whatever the block returns.
      def configure
        yield self
      end
    end
  end
end
13
+
14
# Load every file under content/. The glob result is sorted explicitly because
# Dir[] only guarantees a sorted result from Ruby 3.0 on; on older Rubies the
# order depends on the filesystem, which made the load order platform-dependent.
Dir["#{__dir__}/content/**/*.rb"].sort.each { |file| require file }
@@ -0,0 +1,13 @@
1
# Reader for the JSON document found at OpenStax::Content.abl_url.
class OpenStax::Content::Abl
  # Downloads and parses the document once per instance.
  #
  # Keys are symbolized at every nesting level by the JSON parser itself
  # (symbolize_names), so this no longer depends on ActiveSupport's
  # Hash#deep_symbolize_keys being loaded.
  #
  # Raises JSON::ParserError if the response body is not valid JSON.
  def body
    @body ||= JSON.parse(Faraday.get(OpenStax::Content.abl_url).body, symbolize_names: true)
  end

  # Value stored under the :approved_books key of the parsed document.
  def approved_books
    body[:approved_books]
  end

  # Value stored under the :approved_versions key of the parsed document.
  def approved_versions
    body[:approved_versions]
  end
end
@@ -0,0 +1,104 @@
1
+ require 'addressable/uri'
2
+ require 'faraday'
3
+
4
# Client for one version of the content archive: builds archive URLs, fetches
# them with Faraday, parses the JSON and remembers the slug of every object
# it has parsed.
class OpenStax::Content::Archive
  # version - the archive version used as the last segment of base_url
  def initialize(version)
    @slugs = {}
    @version = version
  end

  # "https://<domain>/<archive_path>/<version>", built from the gem config.
  def base_url
    @base_url ||= begin
      domain = OpenStax::Content.domain
      archive_path = OpenStax::Content.archive_path
      "https://#{domain}/#{archive_path}/#{@version}"
    end
  end

  # Converts an archive reference into the URL to fetch it from.
  #
  # Returns nil for nil. Returns the input unchanged when it cannot be parsed
  # or has an empty path (both cases are logged as warnings). Absolute URLs
  # are logged as unexpected and returned with their scheme forced to https.
  # Paths starting with "../" are resolved against base_url; everything else
  # is placed under "<base_url>/contents" with a ".json" extension.
  def url_for(object)
    return if object.nil?

    uri = parse_uri(object)
    return object if uri.nil?

    if uri.absolute?
      OpenStax::Content.logger.warn do
        "#{self.class.name} received an unexpected absolute URL in url_for: \"#{object}\""
      end

      # Force absolute URLs to be https
      uri.scheme = 'https'
      return uri.to_s
    end

    if uri.path.empty?
      OpenStax::Content.logger.warn do
        "#{self.class.name} received an unexpected fragment-only URL in url_for: \"#{object}\""
      end

      return object
    end

    if uri.path.start_with?('../')
      # Drop the leading ".." so the remaining "/..." path hangs off base_url
      uri.path = uri.path.sub('..', '')
      return "#{base_url}#{uri}"
    end

    # Replace a trailing .json/.xhtml extension (if any) with .json
    uri.path = "#{uri.path.chomp('.json').chomp('.xhtml')}.json"

    separator = uri.path.start_with?('/') ? '' : '/'
    "#{base_url}/contents#{separator}#{uri}"
  end

  # Fetches the given archive reference and returns the raw response body.
  def fetch(object)
    target_url = url_for object
    OpenStax::Content.logger.debug { "Fetching #{target_url}" }
    response = Faraday.get(target_url)
    response.body
  end

  # Fetches and parses the given archive reference, recording its 'slug'
  # under the reference that was requested.
  #
  # Raises RuntimeError (with the resolved URL in the message) when the
  # response body is not valid JSON.
  def json(object)
    parsed = JSON.parse(fetch(object))
    @slugs[object] = parsed['slug']
    parsed
  rescue JSON::ParserError => err
    raise "OpenStax Content Archive returned invalid JSON for #{url_for object}: #{err.message}"
  end

  def s3
    @s3 ||= OpenStax::Content::S3.new
  end

  # Takes a "book[@version][:page]" reference. When the book version is
  # missing and the S3 bucket is configured, fills in the version of the
  # first entry of s3.ls(@version) whose uuid matches; otherwise returns the
  # reference as it was given.
  def add_latest_book_version_if_missing(object)
    book_id, page_id = object.split(':', 2)
    book_uuid, book_version = book_id.split('@', 2)
    return object unless book_version.nil? && s3.bucket_configured?

    listed_book = s3.ls(@version).find { |book| book.split('@').first == book_uuid }
    book_version = listed_book.split('@')[1] unless listed_book.nil?

    # chomp removes the separators left behind by a missing version or page
    versioned_book_id = "#{book_uuid}@#{book_version}".chomp('@')
    "#{versioned_book_id}:#{page_id}".chomp(':')
  end

  # Slug of the given reference, fetched on first use and cached afterwards
  # under both the requested reference and its versioned form.
  def slug(object)
    cached = @slugs[object]
    return cached if cached

    versioned_object = add_latest_book_version_if_missing object
    fetched_slug = json(versioned_object)['slug']
    @slugs[versioned_object] = fetched_slug if versioned_object != object
    @slugs[object] = fetched_slug
  end

  private

  # Parses object as a URI, retrying with a leading "/" when the first attempt
  # is rejected. Logs a warning and returns nil when both attempts fail.
  def parse_uri(object)
    Addressable::URI.parse object
  rescue Addressable::URI::InvalidURIError
    begin
      Addressable::URI.parse "/#{object}"
    rescue Addressable::URI::InvalidURIError
      OpenStax::Content.logger.warn { "Invalid url: \"#{object}\" in archive link" }
      nil
    end
  end
end
@@ -0,0 +1,70 @@
1
+ require_relative 'archive'
2
+ require_relative 'book_part'
3
+
4
# A single book in the content archive, identified by uuid and version.
# Attributes not passed to the constructor are read lazily from the book's
# archive JSON.
#
# NOTE(review): this class defines #hash to return the archive JSON, which
# replaces Object#hash; instances are therefore unsuitable as Hash keys.
class OpenStax::Content::Book
  # archive_version - version handed to OpenStax::Content::Archive.new
  # uuid, version   - book identity; each falls back to hash['id'] and
  #                   hash['version'] respectively
  # hash            - already-fetched book JSON (optional)
  # title, tree, root_book_part - optional precomputed values
  #
  # Raises ArgumentError when the uuid or the version cannot be determined.
  def initialize(
    archive_version:, uuid: nil, version: nil, hash: nil, title: nil, tree: nil, root_book_part: nil
  )
    book_hash = hash || {}

    @uuid = uuid || book_hash['id']
    raise ArgumentError, 'Either uuid or hash with id key is required' if @uuid.nil?

    @version = version || book_hash['version']
    raise ArgumentError, 'Either version or hash with version key is required' if @version.nil?

    @archive_version = archive_version
    @hash = hash
    @title = title
    @tree = tree
    @root_book_part = root_book_part
  end

  # uuid and version have explicit readers below, so they are not listed here
  # (listing them too only produced "method redefined" warnings).
  attr_reader :archive_version

  def archive
    @archive ||= OpenStax::Content::Archive.new archive_version
  end

  # URL of this book's JSON in the archive.
  def url
    @url ||= archive.url_for archive_id
  end

  # The URL without its trailing ".json".
  def url_fragment
    @url_fragment ||= url.chomp('.json')
  end

  def baked
    @baked ||= hash['baked']
  end

  def collated
    @collated ||= hash.fetch('collated', false)
  end

  # The book JSON, fetched from the archive on first use.
  #
  # The archive is given the "uuid@version" identifier rather than the
  # already-resolved absolute URL: Archive#json resolves its argument through
  # url_for again, and url_for logs a warning for every absolute URL it sees.
  # Both forms resolve to the same URL.
  def hash
    @hash ||= archive.json archive_id
  end

  def uuid
    @uuid ||= hash.fetch('id')
  end

  def short_id
    @short_id ||= hash['shortId']
  end

  def version
    @version ||= hash.fetch('version')
  end

  def title
    @title ||= hash.fetch('title')
  end

  def tree
    @tree ||= hash.fetch('tree')
  end

  # Root of the book's table of contents, built from tree.
  def root_book_part
    @root_book_part ||= OpenStax::Content::BookPart.new(hash: tree, is_root: true, book: self)
  end

  private

  # Identifier of this book inside the archive: "uuid@version".
  def archive_id
    "#{uuid}@#{version}"
  end
end
@@ -0,0 +1,47 @@
1
require 'securerandom'

require_relative 'title'
require_relative 'page'
3
+
4
# A node of a book's tree that has child contents. Children are exposed
# through #parts as nested BookParts or Pages.
class OpenStax::Content::BookPart
  # hash    - the tree node for this part (string keys)
  # is_root - whether this part is the root of the book tree
  # book    - the book this part belongs to
  def initialize(hash: {}, is_root: false, book: nil)
    @hash = hash
    @is_root = is_root
    @book = book
  end

  attr_reader :hash, :is_root, :book

  # Raises KeyError when the node has no 'title'.
  def parsed_title
    @parsed_title ||= OpenStax::Content::Title.new hash.fetch('title')
  end

  def book_location
    @book_location ||= parsed_title.book_location
  end

  def title
    @title ||= parsed_title.text
  end

  # Old content used to have id == "subcol" for units and chapters
  # If we encounter that, just assign a random UUID to them
  # Otherwise the id is returned without its "@version" suffix.
  def uuid
    @uuid ||= begin
      raw_id = hash['id']
      raw_id.nil? || raw_id == 'subcol' ? SecureRandom.uuid : raw_id.split('@').first
    end
  end

  # Raises KeyError when the node has no 'contents'.
  def contents
    @contents ||= hash.fetch('contents')
  end

  # One object per entry of contents: entries that have their own 'contents'
  # become nested parts of this same class, all other entries become Pages.
  def parts
    @parts ||= contents.map do |part_hash|
      part_class = part_hash.key?('contents') ? self.class : OpenStax::Content::Page
      part_class.new book: book, hash: part_hash
    end
  end
end
@@ -0,0 +1,9 @@
1
+ require 'singleton'
2
+
3
# Singleton holding custom selector functions.
# NOTE(review): presumably passed as the custom handler argument of Nokogiri
# css/at_css calls (see Fragment::Html#has_css?) — confirm against callers.
class OpenStax::Content::CustomCss
  include Singleton

  # Keeps the nodes that contain at least `number` (default 1) descendants
  # matching `selector`. The method name has a dash in it, hence define_method.
  define_method(:'has-descendants') do |node_set, selector, number = 1|
    node_set.select do |node|
      descendants = node.css(selector)
      descendants.size >= number
    end
  end
end
@@ -0,0 +1,19 @@
1
# Base class for all fragments.
#
# Every fragment subclass has to survive a to_yaml / YAML.load round trip.
# Nokogiri nodes cannot be serialized, so anything needed from the node has to
# be extracted inside the initialize method.
class OpenStax::Content::Fragment
  attr_reader :title
  attr_reader :labels
  attr_reader :node_id

  # node   - the source node; only its :id attribute is kept
  # title  - optional fragment title
  # labels - optional labels; nil is stored as an empty Array
  def initialize(node:, title: nil, labels: nil)
    @node_id = node[:id]
    @labels = labels || []
    @title = title
  end

  # Always false in the base class; subclasses override as needed.
  def html?
    false
  end

  # Always false in the base class; subclasses override as needed.
  def blank?
    false
  end
end
@@ -0,0 +1,67 @@
1
+ require_relative 'html'
2
+
3
# Html fragment that embeds external content through an iframe or a link.
# Extracts the embedded URL and its dimensions; when the URL node is an
# iframe it also gets the configured classes, title and default size.
class OpenStax::Content::Fragment::Embedded < OpenStax::Content::Fragment::Html
  # Used to get the title
  TITLE_CSS = '[data-type="title"]'

  # Used to get the title if there are no title nodes
  LABEL_ATTRIBUTE = '[data-label]'

  # CSS to find embedded content urls
  TAGGED_URL_CSS = 'iframe.os-embed, a.os-embed, .os-embed iframe, .os-embed a'
  UNTAGGED_URL_CSS = 'iframe, a'

  # Class-level settings, copied to each subclass when it is defined
  CLASS_ATTRIBUTES = [ :iframe_classes, :iframe_title, :default_width, :default_height ].freeze

  class << self
    attr_accessor(*CLASS_ATTRIBUTES)

    # Copies the current class-level settings to the new subclass.
    # Calls super first so inherited hooks defined further up the ancestor
    # chain keep running.
    def inherited(subclass)
      super

      CLASS_ATTRIBUTES.each do |class_attribute|
        subclass.public_send "#{class_attribute}=", public_send(class_attribute)
      end
    end
  end

  self.iframe_classes = ['os-embed']
  self.iframe_title = ''

  # Instance-level readers that delegate to the class-level settings
  CLASS_ATTRIBUTES.each do |class_attribute|
    define_method(class_attribute) { self.class.public_send class_attribute }
  end

  attr_reader :url, :width, :height

  # node   - the source node containing the embed
  # title  - optional title; when absent it is built from the title nodes, or
  #          from the data-label attributes when there are no title nodes
  # labels - optional labels
  def initialize(node:, title: nil, labels: nil)
    super

    @title ||= begin
      title_nodes = @node.css(TITLE_CSS)
      titles =
        if title_nodes.empty?
          @node.css(LABEL_ATTRIBUTE).map { |label| label.attr('data-label') }
        else
          title_nodes.map { |title_node| title_node.content.strip }
        end
      titles.uniq.join('; ')
    end

    # Prefer nodes explicitly tagged as embeds; otherwise use the last iframe/link
    url_node = @node.at_css(TAGGED_URL_CSS) || @node.css(UNTAGGED_URL_CSS).last

    @width = url_node&.[]('width') || default_width
    @height = url_node&.[]('height') || default_height
    @url = url_node&.[]('src') || url_node&.[]('href')

    if url_node&.name == 'iframe'
      node_classes = url_node['class'].to_s.split(' ') + iframe_classes
      url_node['class'] = node_classes.uniq.join(' ')
      url_node['title'] ||= iframe_title
      # To always force the default iframe size, change ||= to =
      url_node['width'] ||= default_width
      url_node['height'] ||= default_height
    end

    # Re-serialize, since the iframe attributes may have changed above
    @to_html = @node.to_html
  end

  # An embedded fragment without a URL has nothing to show.
  def blank?
    url.nil? || url.empty?
  end
end
@@ -0,0 +1,56 @@
1
+ require_relative '../fragment'
2
+
3
# Fragment that stands for one or more exercises, described by the search
# queries found in the node's exercise links.
class OpenStax::Content::Fragment::Exercise < OpenStax::Content::Fragment
  # CSS to find the embed code attributes
  EXERCISE_EMBED_URL_CSS = 'a[href^="#"]'

  # Regex to extract the appropriate tag from the embed code(s)
  EXERCISE_EMBED_URL_REGEXES = {
    tag: /\A#ost\/api\/ex\/([\w\s-]+)\z/,
    nickname: /\A#exercises?\/([\w\s-]+)\z/
  }

  # CSS to find the exercise embed queries after the urls are absolutized
  ABSOLUTIZED_EMBED_URL_CSS = 'a[href*="/api/exercises"]'

  # Regex to extract the appropriate embed queries from the absolutized urls
  ABSOLUTIZED_EMBED_URL_REGEX = \
    /\/api\/exercises\/?\?q=(tag|nickname)(?::|%3A)(?:"|%22)?([\w\s%-]+?)(?:"|%22)?\z/

  attr_reader :embed_queries

  # Rewrites every exercise embed link ("#ost/api/ex/..." or "#exercise/...")
  # inside node, in place, into a search URL based on
  # OpenStax::Content.exercises_search_api_url, and marks the anchor with
  # data-type="exercise". Links matching neither pattern are left untouched.
  #
  # NOTE(review): the original comment said this is run from
  # lib/openstax/cnx/v1/page.rb during import; that path is not part of the
  # code visible here — confirm the actual caller.
  def self.absolutize_exercise_urls!(node)
    uri = Addressable::URI.parse OpenStax::Content.exercises_search_api_url

    node.css(EXERCISE_EMBED_URL_CSS).each do |anchor|
      href = anchor.attribute('href')

      # The first pattern that matches wins
      field, regex = EXERCISE_EMBED_URL_REGEXES.find do |_field, candidate|
        candidate.match(href.value)
      end
      next if regex.nil?

      embed_code = regex.match(href.value)[1]
      uri.query_values = { q: "#{field}:\"#{embed_code}\"" }
      href.value = uri.to_s
      anchor['data-type'] = 'exercise'
    end
  end

  # Collects one [field, value] pair (field is :tag or :nickname) for every
  # absolutized exercise link in node; links that do not match are skipped.
  def initialize(node:, title: nil, labels: [])
    super

    @embed_queries = node.css(ABSOLUTIZED_EMBED_URL_CSS).each_with_object([]) do |anchor, queries|
      captures = ABSOLUTIZED_EMBED_URL_REGEX.match(anchor.attribute('href').value)
      next if captures.nil?

      queries << [ captures[1].to_sym, URI.decode_www_form_component(captures[2]) ]
    end
  end

  # Blank when there are no queries and the source node had no id.
  def blank?
    embed_queries.empty? && (node_id.nil? || node_id.empty?)
  end
end
@@ -0,0 +1,62 @@
1
+ require 'addressable/uri'
2
+
3
+ require_relative '../fragment'
4
+
5
# Fragment backed by HTML. The serialized HTML (to_html) is the source of
# truth; the parsed node is rebuilt from it on demand.
class OpenStax::Content::Fragment::Html < OpenStax::Content::Fragment
  attr_reader :to_html

  # Keeps a private re-parsed copy of the given node together with its
  # serialized HTML.
  def initialize(node:, title: nil, labels: nil)
    super

    @node = Nokogiri::HTML.fragment(node.to_html)
    @to_html = @node.to_html
  end

  def as_json(*args)
    # Don't attempt to serialize @node (it would fail)
    serialized = super
    serialized.except('node')
  end

  # True when there is any HTML at all.
  def html?
    !to_html.empty?
  end

  def blank?
    !html?
  end

  # Parsed HTML, re-created from to_html when @node is not set.
  def node
    @node ||= Nokogiri::HTML.fragment to_html
  end

  # True when at least one node matches css; custom_css is passed through to
  # at_css together with the selector.
  def has_css?(css, custom_css)
    match = node.at_css(css, custom_css)
    !match.nil?
  end

  # Appends new_node to the body element (or to the root when there is no
  # body) and refreshes to_html.
  def append(new_node)
    container = node.at_css('body') || node.root
    container << new_node

    @to_html = node.to_html
  end

  # Points fragment-only links whose target is not part of this fragment at
  # @reference_view_url instead, then refreshes to_html. No link is changed
  # while @reference_view_url is nil.
  def transform_links!
    unless @reference_view_url.nil?
      node.css('[href]').each do |link|
        href = link.attributes['href']
        uri = begin
          Addressable::URI.parse(href.value)
        rescue StandardError
          nil
        end

        # Modify only fragment-only links
        next if uri.nil? || uri.absolute? || !uri.path.empty?

        # Abort if there is no target or it contains double quotes
        target = uri.fragment
        next if target.nil? || target.empty? || target.include?('"')

        # Abort if the target is still present in this fragment
        next if node.at_css(%([id="#{target}"], [name="#{target}"]))

        # Change the link to point to the reference view
        href.value = "#{@reference_view_url}##{target}"
      end
    end

    @to_html = node.to_html
  end
end