openstax_content 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/README.md ADDED
@@ -0,0 +1,2 @@
1
+ # content-ruby
2
+ Ruby bindings to read and parse the OpenStax ABL and the content archive
@@ -0,0 +1,14 @@
1
# Top-level namespace and process-wide settings for the OpenStax content bindings.
module OpenStax
  module Content
    class << self
      # URL the approved book list is downloaded from (read by Abl#body).
      attr_accessor :abl_url

      # Host name and path prefix used to build archive URLs (read by Archive#base_url).
      attr_accessor :domain, :archive_path

      # Base URL of the exercises search API
      # (read by Fragment::Exercise.absolutize_exercise_urls!).
      attr_accessor :exercises_search_api_url

      # Logger that Archive sends its warn and debug messages to (called with a block).
      attr_accessor :logger

      # S3 settings. NOTE(review): nothing in the code shown here reads these;
      # presumably they are consumed by OpenStax::Content::S3 - confirm.
      attr_accessor :bucket_name, :s3_region, :s3_access_key_id, :s3_secret_access_key

      # Yields this module so the settings above can be assigned in one block:
      #
      #   OpenStax::Content.configure { |config| config.domain = 'example.org' }
      #
      # Returns whatever the block returns.
      def configure
        yield(self)
      end
    end
  end
end
13
+
14
# Load every file under content/. Dir[] only returns sorted results by default
# on Ruby >= 3.0, so sort explicitly to keep the load order deterministic
# across Ruby versions and filesystems.
Dir["#{__dir__}/content/**/*.rb"].sort.each { |file| require file }
@@ -0,0 +1,13 @@
1
# Reads the OpenStax approved book list (ABL) from OpenStax::Content.abl_url.
class OpenStax::Content::Abl
  # Downloads and parses the ABL on first use, then returns the memoized result.
  #
  # Keys are symbolized by the JSON parser itself (symbolize_names), so this no
  # longer depends on ActiveSupport's Hash#deep_symbolize_keys being loaded.
  #
  # @return [Hash] the parsed ABL document, with symbol keys at every level
  # @raise [JSON::ParserError] if the response body is not valid JSON
  def body
    @body ||= JSON.parse(Faraday.get(OpenStax::Content.abl_url).body, symbolize_names: true)
  end

  # @return [Object, nil] the value stored under :approved_books in the ABL
  def approved_books
    body[:approved_books]
  end

  # @return [Object, nil] the value stored under :approved_versions in the ABL
  def approved_versions
    body[:approved_versions]
  end
end
@@ -0,0 +1,104 @@
1
+ require 'addressable/uri'
2
+ require 'faraday'
3
+
4
# Builds URLs for, and downloads JSON from, one version of the OpenStax content archive.
class OpenStax::Content::Archive
  # @param version the archive version; it becomes the last segment of #base_url
  def initialize(version)
    @version = version
    @slugs = {}
  end

  # Root URL of this archive version, built from the configured domain and archive path.
  def base_url
    @base_url ||= begin
      host = OpenStax::Content.domain
      prefix = OpenStax::Content.archive_path
      "https://#{host}/#{prefix}/#{@version}"
    end
  end

  # Converts an archive identifier or link into a URL string.
  #
  # * nil                -> nil
  # * unparseable        -> returned unchanged (a warning is logged)
  # * absolute URL       -> same URL with the scheme forced to https (a warning is logged)
  # * empty path         -> returned unchanged (a warning is logged)
  # * starts with '../'  -> base_url plus the link with its leading '..' removed
  # * anything else      -> base_url/contents/<link>.json ('.json'/'.xhtml' suffix replaced)
  def url_for(object)
    return if object.nil?

    uri = begin
      Addressable::URI.parse object
    rescue Addressable::URI::InvalidURIError
      begin
        # Retry as an absolute path before giving up
        Addressable::URI.parse "/#{object}"
      rescue Addressable::URI::InvalidURIError
        OpenStax::Content.logger.warn { "Invalid url: \"#{object}\" in archive link" }

        return object
      end
    end

    if uri.absolute?
      OpenStax::Content.logger.warn do
        "#{self.class.name} received an unexpected absolute URL in url_for: \"#{object}\""
      end

      # Force absolute URLs to be https
      uri.scheme = 'https'
      uri.to_s
    elsif uri.path.empty?
      OpenStax::Content.logger.warn do
        "#{self.class.name} received an unexpected fragment-only URL in url_for: \"#{object}\""
      end

      object
    elsif uri.path.start_with?('../')
      # Only the first '..' is dropped, leaving a path that starts with '/'
      uri.path = uri.path.sub('..', '')
      "#{base_url}#{uri}"
    else
      uri.path = "#{uri.path.chomp('.json').chomp('.xhtml')}.json"
      separator = uri.path.start_with?('/') ? '' : '/'
      "#{base_url}/contents#{separator}#{uri}"
    end
  end

  # Downloads the given object and returns the raw response body.
  def fetch(object)
    target = url_for(object)
    OpenStax::Content.logger.debug { "Fetching #{target}" }
    Faraday.get(target).body
  end

  # Downloads and parses the given object, remembering its 'slug' for #slug.
  #
  # Raises a RuntimeError naming the URL when the response is not valid JSON.
  def json(object)
    parsed = JSON.parse(fetch(object))
    @slugs[object] = parsed['slug']
    parsed
  rescue JSON::ParserError => err
    raise "OpenStax Content Archive returned invalid JSON for #{url_for object}: #{err.message}"
  end

  def s3
    @s3 ||= OpenStax::Content::S3.new
  end

  # Given "book_uuid[@version][:page_id]", fills in a missing book version using
  # the first entry of s3.ls(@version) whose uuid matches. The object is returned
  # unchanged when it already has a version or no bucket is configured.
  def add_latest_book_version_if_missing(object)
    book_id, page_id = object.split(':', 2)
    book_uuid, book_version = book_id.split('@', 2)
    return object unless book_version.nil? && s3.bucket_configured?

    listing = s3.ls(@version).find { |entry| entry.split('@').first == book_uuid }
    book_version = listing.split('@')[1] unless listing.nil?

    # chomp drops the separator when there is no version / no page id
    versioned_book = "#{book_uuid}@#{book_version}".chomp('@')
    "#{versioned_book}:#{page_id}".chomp(':')
  end

  # Returns the slug of the given object, fetching its JSON only when the slug
  # has not been cached yet.
  def slug(object)
    cached = @slugs[object]
    return cached if cached

    versioned = add_latest_book_version_if_missing object
    found = json(versioned)['slug']
    @slugs[versioned] = found if versioned != object
    @slugs[object] = found
  end
end
@@ -0,0 +1,70 @@
1
+ require_relative 'archive'
2
+ require_relative 'book_part'
3
+
4
# One version of a book in the OpenStax content archive.
#
# Only the archive version plus the book's uuid and version (given directly or
# through +hash+) are needed up front; everything else is read lazily from the
# book's archive JSON and memoized.
class OpenStax::Content::Book
  # @param archive_version version handed to OpenStax::Content::Archive.new
  # @param uuid [String, nil] book uuid; falls back to hash['id']
  # @param version [String, nil] book version; falls back to hash['version']
  # @param hash [Hash, nil] already-downloaded book JSON, if available
  # @param title [String, nil] preloaded title (otherwise read from the JSON)
  # @param tree [Hash, nil] preloaded table of contents (otherwise read from the JSON)
  # @param root_book_part [OpenStax::Content::BookPart, nil] preloaded root part
  # @raise [ArgumentError] if neither the arguments nor +hash+ provide a uuid and a version
  def initialize(
    archive_version:, uuid: nil, version: nil, hash: nil, title: nil, tree: nil, root_book_part: nil
  )
    preloaded = hash || {}

    @uuid = uuid || preloaded['id']
    raise ArgumentError, 'Either uuid or hash with id key is required' if @uuid.nil?

    @version = version || preloaded['version']
    raise ArgumentError, 'Either version or hash with version key is required' if @version.nil?

    @archive_version = archive_version
    @hash = hash
    @title = title
    @tree = tree
    @root_book_part = root_book_part
  end

  # uuid and version have explicit readers below, so they are intentionally not
  # listed here (listing them only produced "method redefined" warnings).
  attr_reader :archive_version

  def archive
    @archive ||= OpenStax::Content::Archive.new archive_version
  end

  # Archive URL of this book's JSON document.
  def url
    @url ||= archive.url_for "#{uuid}@#{version}"
  end

  # Same as #url, without the trailing '.json'.
  def url_fragment
    @url_fragment ||= url.chomp('.json')
  end

  def baked
    @baked ||= hash['baked']
  end

  def collated
    @collated ||= hash.fetch('collated', false)
  end

  # The book's archive JSON, downloaded on first use unless it was preloaded.
  #
  # The archive is asked for the book id rather than for #url: Archive#url_for
  # builds the same URL from the id, whereas handing it the already absolute
  # URL made it log an "unexpected absolute URL" warning on every book load.
  def hash
    @hash ||= archive.json "#{uuid}@#{version}"
  end

  def uuid
    @uuid ||= hash.fetch('id')
  end

  def short_id
    @short_id ||= hash['shortId']
  end

  def version
    @version ||= hash.fetch('version')
  end

  def title
    @title ||= hash.fetch('title')
  end

  # The table of contents hash stored under 'tree' in the book JSON.
  def tree
    @tree ||= hash.fetch('tree')
  end

  # The BookPart wrapping the whole table of contents.
  def root_book_part
    @root_book_part ||= OpenStax::Content::BookPart.new(hash: tree, is_root: true, book: self)
  end
end
@@ -0,0 +1,47 @@
1
require 'securerandom'

require_relative 'title'
require_relative 'page'
3
+
4
# A node of a book's table of contents that has children (a 'contents' key).
# Wraps the corresponding hash of the book's tree.
class OpenStax::Content::BookPart
  # @param hash [Hash] this part's entry in the book tree
  # @param is_root [Boolean] whether this part is the root of the tree
  # @param book [OpenStax::Content::Book, nil] the book this part belongs to
  def initialize(hash: {}, is_root: false, book: nil)
    @hash = hash
    @is_root = is_root
    @book = book
  end

  attr_reader :hash, :is_root, :book

  # The part's 'title' entry wrapped in an OpenStax::Content::Title.
  def parsed_title
    @parsed_title ||= OpenStax::Content::Title.new hash.fetch('title')
  end

  def book_location
    @book_location ||= parsed_title.book_location
  end

  def title
    @title ||= parsed_title.text
  end

  # Old content used to have id == "subcol" for units and chapters
  # If we encounter that (or no id at all), just assign a random UUID to them
  # Otherwise the uuid is the id with any "@version" suffix removed
  def uuid
    @uuid ||= begin
      id = hash['id']
      id.nil? || id == 'subcol' ? SecureRandom.uuid : id.split('@').first
    end
  end

  def contents
    @contents ||= hash.fetch('contents')
  end

  # Children of this part: entries that have their own 'contents' become
  # BookParts, all other entries become Pages.
  def parts
    # The block parameter is deliberately not called `hash`,
    # so it cannot be confused with this part's own #hash reader
    @parts ||= contents.map do |part_hash|
      part_class = part_hash.key?('contents') ? self.class : OpenStax::Content::Page
      part_class.new book: book, hash: part_hash
    end
  end
end
@@ -0,0 +1,9 @@
1
+ require 'singleton'
2
+
3
# Singleton holding custom CSS functions. The method is named with a dash,
# which is why it has to be created with define_method.
class OpenStax::Content::CustomCss
  include Singleton

  # has-descendants(selector, number = 1): keeps the nodes of node_set that
  # contain at least `number` descendants matching `selector`.
  define_method(:'has-descendants') do |node_set, selector, number = 1|
    node_set.select do |candidate|
      matches = candidate.css(selector)
      matches.size >= number
    end
  end
end
@@ -0,0 +1,19 @@
1
# Base class for the pieces a page is split into.
#
# All fragment subclasses must be serializable with to_yaml and YAML.load
# Nokogiri nodes are not serializable, so they must be processed in the initialize method
class OpenStax::Content::Fragment
  attr_reader :title
  attr_reader :labels
  attr_reader :node_id

  # Only the node's id attribute is kept; the node itself is not stored here.
  #
  # @param node the source node; must respond to [] with :id
  # @param title the fragment title, if any
  # @param labels the fragment labels; nil (or false) becomes an empty array
  def initialize(node:, title: nil, labels: nil)
    @title, @labels = title, labels || []
    @node_id = node[:id]
  end

  # Base fragments are never considered blank; subclasses override this.
  def blank?
    false
  end

  # Base fragments carry no HTML; subclasses override this.
  def html?
    false
  end
end
@@ -0,0 +1,67 @@
1
+ require_relative 'html'
2
+
3
# An HTML fragment built around an embedded resource (an iframe or a link).
#
# The class-level settings listed in CLASS_ATTRIBUTES are copied to every
# subclass when it is defined, and each subclass can then override them.
class OpenStax::Content::Fragment::Embedded < OpenStax::Content::Fragment::Html
  # Used to get the title
  TITLE_CSS = '[data-type="title"]'

  # Used to get the title if there are no title nodes
  LABEL_ATTRIBUTE = '[data-label]'

  # CSS to find embedded content urls
  TAGGED_URL_CSS = 'iframe.os-embed, a.os-embed, .os-embed iframe, .os-embed a'
  UNTAGGED_URL_CSS = 'iframe, a'

  CLASS_ATTRIBUTES = %i[iframe_classes iframe_title default_width default_height].freeze

  class << self
    attr_accessor(*CLASS_ATTRIBUTES)

    # Copies the current class-level settings to each new subclass.
    def inherited(subclass)
      # Keep any inherited hook defined further up the ancestor chain working
      super

      CLASS_ATTRIBUTES.each do |class_attribute|
        subclass.send "#{class_attribute}=", send(class_attribute)
      end
    end
  end

  self.iframe_classes = ['os-embed']
  self.iframe_title = ''

  # Instance-level readers that delegate to the class-level settings
  CLASS_ATTRIBUTES.each do |class_attribute|
    define_method(class_attribute) { self.class.send class_attribute }
  end

  attr_reader :url, :width, :height

  # In addition to what Html#initialize does, this:
  # * derives a title when none is given: the distinct texts of the title nodes,
  #   or the distinct data-label values when there are no title nodes, joined with '; '
  # * picks the url node: the first os-embed tagged iframe/link,
  #   otherwise the last iframe/link in the fragment
  # * reads url, width and height from that node, falling back to the class defaults
  # * adds the iframe classes, title and default size to the node when it is an iframe
  def initialize(node:, title: nil, labels: nil)
    super

    @title ||= begin
      title_nodes = @node.css(TITLE_CSS)
      titles = if title_nodes.empty?
        @node.css(LABEL_ATTRIBUTE).map { |label_node| label_node.attr('data-label') }
      else
        title_nodes.map { |title_node| title_node.content.strip }
      end
      titles.uniq.join('; ')
    end

    url_node = @node.at_css(TAGGED_URL_CSS) || @node.css(UNTAGGED_URL_CSS).last

    @width = url_node&.[]('width') || default_width
    @height = url_node&.[]('height') || default_height
    @url = url_node&.[]('src') || url_node&.[]('href')

    if url_node&.name == 'iframe'
      node_classes = url_node['class'].to_s.split(' ') + iframe_classes
      url_node['class'] = node_classes.uniq.join(' ')
      url_node['title'] ||= iframe_title
      # To always force the default iframe size, change ||= to =
      url_node['width'] ||= default_width
      url_node['height'] ||= default_height
    end

    # Re-serialize, since the iframe attributes may have just been changed
    @to_html = @node.to_html
  end

  # An embedded fragment without a url has nothing to show.
  def blank?
    url.nil? || url.empty?
  end
end
@@ -0,0 +1,56 @@
1
+ require_relative '../fragment'
2
+
3
# A fragment that stands for one or more exercises, identified by the
# exercise search queries embedded in its links.
class OpenStax::Content::Fragment::Exercise < OpenStax::Content::Fragment
  # CSS to find the embed code attributes
  EXERCISE_EMBED_URL_CSS = 'a[href^="#"]'

  # Regex to extract the appropriate tag from the embed code(s)
  EXERCISE_EMBED_URL_REGEXES = {
    tag: /\A#ost\/api\/ex\/([\w\s-]+)\z/,
    nickname: /\A#exercises?\/([\w\s-]+)\z/
  }

  # CSS to find the exercise embed queries after the urls are absolutized
  ABSOLUTIZED_EMBED_URL_CSS = 'a[href*="/api/exercises"]'

  # Regex to extract the appropriate embed queries from the absolutized urls
  ABSOLUTIZED_EMBED_URL_REGEX = \
    /\/api\/exercises\/?\?q=(tag|nickname)(?::|%3A)(?:"|%22)?([\w\s%-]+?)(?:"|%22)?\z/

  # Array of [field, value] pairs, e.g. [:tag, "some-tag"], one per exercise link
  attr_reader :embed_queries

  # This code is run from lib/openstax/cnx/v1/page.rb during import
  #
  # Rewrites every "#ost/api/ex/<tag>" or "#exercise(s)/<nickname>" link in node
  # so that it points at the configured exercises search API with the query
  # q=<field>:"<value>", and marks the anchor with data-type="exercise".
  def self.absolutize_exercise_urls!(node)
    uri = Addressable::URI.parse OpenStax::Content.exercises_search_api_url

    node.css(EXERCISE_EMBED_URL_CSS).each do |anchor|
      href = anchor.attribute('href')

      # Only the first matching embed format is used
      field, regex = EXERCISE_EMBED_URL_REGEXES.find { |_field, pattern| pattern =~ href.value }
      next if regex.nil?

      embed_value = regex.match(href.value)[1]
      uri.query_values = { q: "#{field}:\"#{embed_value}\"" }
      href.value = uri.to_s
      anchor['data-type'] = 'exercise'
    end
  end

  # Collects the search queries of all absolutized exercise links in node.
  # Links whose URL does not match the expected query format are ignored.
  def initialize(node:, title: nil, labels: [])
    super

    @embed_queries = node.css(ABSOLUTIZED_EMBED_URL_CSS).each_with_object([]) do |anchor, queries|
      match = ABSOLUTIZED_EMBED_URL_REGEX.match(anchor.attribute('href').value)
      next if match.nil?

      queries << [ match[1].to_sym, URI.decode_www_form_component(match[2]) ]
    end
  end

  # Blank when there is neither an exercise query nor a node id.
  def blank?
    return false unless embed_queries.empty?

    node_id.nil? || node_id.empty?
  end
end
@@ -0,0 +1,62 @@
1
+ require 'addressable/uri'
2
+
3
+ require_relative '../fragment'
4
+
5
# A fragment that carries HTML. The HTML string (to_html) is the source of
# truth; the parsed node is re-created from it on demand.
class OpenStax::Content::Fragment::Html < OpenStax::Content::Fragment
  attr_reader :to_html

  # Keeps its own re-parsed copy of the given node, plus that copy's serialized HTML.
  def initialize(node:, title: nil, labels: nil)
    super

    @node = Nokogiri::HTML.fragment node.to_html
    @to_html = @node.to_html
  end

  def as_json(*args)
    # Don't attempt to serialize @node (it would fail)
    super.except('node')
  end

  def html?
    !to_html.empty?
  end

  def blank?
    !html?
  end

  # The parsed HTML, rebuilt from to_html when @node is not set.
  def node
    @node ||= Nokogiri::HTML.fragment to_html
  end

  # Whether anything in this fragment matches css; custom_css is passed along
  # to at_css as the handler for custom CSS functions.
  def has_css?(css, custom_css)
    !node.at_css(css, custom_css).nil?
  end

  # Adds new_node at the end of this fragment and refreshes to_html.
  def append(new_node)
    (node.at_css('body') || node.root) << new_node

    @to_html = node.to_html
  end

  # Points fragment-only links whose target is not part of this fragment at the
  # reference view. Nothing is rewritten unless @reference_view_url is set.
  # NOTE(review): @reference_view_url is never assigned in this class -
  # presumably a subclass or the code restoring serialized fragments sets it; confirm.
  def transform_links!
    unless @reference_view_url.nil?
      node.css('[href]').each do |link|
        href = link.attributes['href']
        uri = begin
          Addressable::URI.parse(href.value)
        rescue Addressable::URI::InvalidURIError
          # Leave links that are not valid URIs untouched
          nil
        end

        # Modify only fragment-only links
        next if uri.nil? || uri.absolute? || !uri.path.empty?

        # Abort if there is no target or it contains double quotes
        # or it's still present in this fragment
        target = uri.fragment
        next if target.nil? || target.empty? || target.include?('"') ||
                node.at_css("[id=\"#{target}\"], [name=\"#{target}\"]")

        # Change the link to point to the reference view
        href.value = "#{@reference_view_url}##{target}"
      end
    end

    @to_html = node.to_html
  end
end