RubyGems - udise_school_report_reader - Versions diffs - 0.1.0 - Mend

udise_school_report_reader 0.1.0

Files changed (35) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +20 -0
data/LICENSE.txt +21 -0
data/README.md +45 -0
data/lib/udise_school_report_reader/activities_data_reader.rb +58 -0
data/lib/udise_school_report_reader/anganwadi_data_reader.rb +22 -0
data/lib/udise_school_report_reader/basic_info_data_reader.rb +29 -0
data/lib/udise_school_report_reader/block_rectangle_combiner.rb +115 -0
data/lib/udise_school_report_reader/building_data_reader.rb +36 -0
data/lib/udise_school_report_reader/characteristics_reader.rb +28 -0
data/lib/udise_school_report_reader/csv_writer.rb +75 -0
data/lib/udise_school_report_reader/data_reader_base.rb +86 -0
data/lib/udise_school_report_reader/digital_facilities_data_reader.rb +42 -0
data/lib/udise_school_report_reader/enrollment_data_reader.rb +136 -0
data/lib/udise_school_report_reader/enrollment_html_writer.rb +81 -0
data/lib/udise_school_report_reader/enrollment_yaml_writer.rb +62 -0
data/lib/udise_school_report_reader/ews_data_reader.rb +118 -0
data/lib/udise_school_report_reader/ews_html_writer.rb +63 -0
data/lib/udise_school_report_reader/ews_yaml_writer.rb +31 -0
data/lib/udise_school_report_reader/location_data_reader.rb +47 -0
data/lib/udise_school_report_reader/official_data_reader.rb +40 -0
data/lib/udise_school_report_reader/pdf_block_extractor.rb +49 -0
data/lib/udise_school_report_reader/pdf_content_compressor.rb +36 -0
data/lib/udise_school_report_reader/pdf_rectangle_extractor.rb +53 -0
data/lib/udise_school_report_reader/rooms_data_reader.rb +36 -0
data/lib/udise_school_report_reader/rte_data_reader.rb +118 -0
data/lib/udise_school_report_reader/rte_html_writer.rb +63 -0
data/lib/udise_school_report_reader/rte_yaml_writer.rb +61 -0
data/lib/udise_school_report_reader/sanitation_data_reader.rb +56 -0
data/lib/udise_school_report_reader/school_report_parser.rb +295 -0
data/lib/udise_school_report_reader/teacher_data_reader.rb +204 -0
data/lib/udise_school_report_reader/version.rb +3 -0
data/lib/udise_school_report_reader.rb +41 -0
data/test/school_report_parser_test.rb +62 -0
metadata +165 -0

data/lib/udise_school_report_reader/enrollment_data_reader.rb ADDED Viewed

@@ -0,0 +1,136 @@
# Reads the enrolment grid found on page 2 of a report from the intermediate
# CSV of positioned text cells.
#
# The CSV must have a header row; the columns read here are `page`, `text`,
# `text_x`, `text_y`, `rect_x` and `rect_y`.
class EnrollmentDataReader
  SOCIAL_CATEGORIES = [
    { key: 'gen', label: 'Gen' },
    { key: 'sc', label: 'SC' },
    { key: 'st', label: 'ST' },
    { key: 'obc', label: 'OBC' }
  ].freeze

  RELIGION_CATEGORIES = [
    { key: 'musl', label: 'Musl' },
    { key: 'chris', label: 'Chris' },
    { key: 'sikh', label: 'Sikh' },
    { key: 'budd', label: 'Budd' },
    { key: 'parsi', label: 'Parsi' },
    { key: 'jain', label: 'Jain' },
    { key: 'others', label: 'Others' }
  ].freeze

  OTHER_CATEGORIES = [
    { key: 'aadh', label: 'Aadh' },
    { key: 'bpl', label: 'BPL' },
    { key: 'rept', label: 'Rept' },
    { key: 'cwsn', label: 'CWSN' }
  ].freeze

  # Ages 3..22; the first age row is labelled ">3" in the grid.
  AGE_CATEGORIES = (3..22).map do |age|
    { key: "age_#{age}", label: age == 3 ? '>3' : age.to_s }
  end.freeze

  ALL_CATEGORIES = [
    SOCIAL_CATEGORIES,
    RELIGION_CATEGORIES,
    OTHER_CATEGORIES,
    AGE_CATEGORIES
  ].flatten.freeze

  # Column header texts recognised as grade names and as boy/girl markers.
  GRADE_LABELS = %w[Pre-Pr I II III IV V VI VII VIII IX X XI XII].freeze
  BG_LABELS = %w[B G].freeze

  # Convenience wrapper: build a reader for +csv_path+ and run it.
  def self.read(csv_path) = new(csv_path).read

  # @param csv_path [String] path of the CSV to read
  def initialize(csv_path)
    @csv_path = csv_path
    # No cutoff until a "Total" header is seen. (A default of 0 would make the
    # `text_x >= @x_cutoff` filter below discard every single row.)
    @x_cutoff = Float::INFINITY
    @category_y_coords = {}
  end

  # Parses the CSV.
  #
  # @return [Hash, nil] nil when no grade header cell is found; otherwise a
  #   hash with :grade_rows (grade header cells sorted by x), :bg_pairs
  #   (x midpoint => [B cell, G cell]) and one :"<key>_numbers" entry per
  #   category (x midpoint => [boys cell or nil, girls cell or nil]).
  def read
    rows = CSV.foreach(@csv_path, headers: true).select { |row| row['page'] == '2' }
    locate_category_rows(rows)

    grade_rows = []
    bg_rows = []
    category_rows = ALL_CATEGORIES.to_h { |category| [category[:key], []] }

    rows.each do |row|
      text = row['text']
      # The "Total" header marks where the per-grade columns stop.
      @x_cutoff = row['rect_x'].to_f if text == 'Total' && row['rect_y'].to_f == 778.0

      if GRADE_LABELS.include?(text)
        grade_rows << row if row['text_y'].to_f == 780.0
      elsif BG_LABELS.include?(text)
        bg_rows << row if row['text_y'].to_f == 768.0
      elsif text&.match?(/\A\d+\z/)
        y_coord = row['rect_y'].to_f
        @category_y_coords.each do |key, label_y|
          category_rows[key] << row if (y_coord - label_y).abs < 5.0
        end
      end
    end
    return nil if grade_rows.empty?

    # Order every collection left to right and drop the totals column(s).
    [grade_rows, bg_rows, *category_rows.values].each do |cells|
      cells.sort_by! { |cell| cell['text_x'].to_f }
      cells.reject! { |cell| cell['text_x'].to_f >= @x_cutoff }
    end

    # Consecutive markers form a B/G pair; a trailing unpaired marker is
    # skipped instead of raising on a missing partner.
    bg_pairs = bg_rows.each_slice(2).filter_map do |b, g|
      next unless g

      [(b['text_x'].to_f + g['text_x'].to_f) / 2, [b, g]]
    end.to_h

    result = { grade_rows: grade_rows, bg_pairs: bg_pairs }
    ALL_CATEGORIES.each do |category|
      result[:"#{category[:key]}_numbers"] =
        match_numbers_to_pairs(category_rows[category[:key]], bg_pairs)
    end
    result
  end

  private

    # Records the rect_y of each category label cell. Label cells are the
    # ones whose rect_x lies within 5.0 of 27.0; blank cells are ignored.
    def locate_category_rows(rows)
      rows.each do |row|
        next unless (row['rect_x'].to_f - 27.0).abs < 5.0

        label = row['text']&.downcase
        next unless label

        ALL_CATEGORIES.each do |category|
          @category_y_coords[category[:key]] = row['rect_y'].to_f if label == category[:label].downcase
        end
      end
    end

    # For every B/G pair, takes the first unused number cell whose text_x is
    # within +threshold+ of the B marker, then likewise for the G marker.
    #
    # @return [Hash] x midpoint => [boys cell or nil, girls cell or nil]
    def match_numbers_to_pairs(remaining_numbers, bg_pairs, threshold = 10.0)
      remaining = remaining_numbers.dup
      bg_pairs.each_with_object({}) do |(x_mid, bg_pair), numbers|
        numbers[x_mid] = bg_pair.first(2).map do |marker|
          marker_x = marker['text_x'].to_f
          # Remove by position so an equal-looking duplicate row survives.
          index = remaining.index { |row| (row['text_x'].to_f - marker_x).abs < threshold }
          remaining.delete_at(index) if index
        end
      end
    end
end

data/lib/udise_school_report_reader/enrollment_html_writer.rb ADDED Viewed

@@ -0,0 +1,81 @@
# Renders the enrolment hash (with :grade_rows, :bg_pairs and
# :"<key>_numbers" entries) as a standalone HTML table.
class EnrollmentHtmlWriter
  # Row label => key in the data hash, in display order.
  CATEGORY_ROWS = [
    ['Gen', :gen_numbers],
    ['SC', :sc_numbers],
    ['ST', :st_numbers],
    ['OBC', :obc_numbers],
    ['Muslim', :musl_numbers],
    ['Christian', :chris_numbers],
    ['Sikh', :sikh_numbers],
    ['Buddhist', :budd_numbers],
    ['Parsi', :parsi_numbers],
    ['Jain', :jain_numbers],
    ['Others', :others_numbers],
    ['Aadhaar', :aadh_numbers],
    ['BPL', :bpl_numbers],
    ['Repeater', :rept_numbers],
    ['CWSN', :cwsn_numbers]
  ].freeze

  HTML_ESCAPES = {
    '&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quot;', "'" => '&#39;'
  }.freeze

  # Writes the table to +html_path+.
  #
  # @param data [Hash, nil] enrolment hash; nothing is written when nil
  # @param html_path [String] destination file
  # @return [Integer, nil] result of File.write, or nil when +data+ is nil
  def self.generate_html(data, html_path)
    return unless data

    grade_rows = data[:grade_rows]
    bg_pairs = data[:bg_pairs]

    categories = CATEGORY_ROWS.map { |label, key| [label, data[key]] }
    ages = (3..22).map { |age| ["Age #{age}", data[:"age_#{age}_numbers"]] }

    # One table row per category/age, one boys cell and one girls cell per pair.
    table_rows = (categories + ages).map do |category, numbers|
      cells = bg_pairs.map do |x_mid, _|
        # A category missing from the data hash renders as empty cells.
        nums = numbers&.[](x_mid)
        b_num = nums&.first
        g_num = nums&.last
        "<td>#{b_num ? escape_html(b_num['text']) : ''}</td>" \
          "<td>#{g_num ? escape_html(g_num['text']) : ''}</td>"
      end.join
      "    <tr>\n" \
      "      <td class=\"category\">#{category}</td>\n" \
      "      #{cells}\n" \
      "    </tr>"
    end.join("\n")

    # Header rows: grade names spanning two columns, then a B/G pair per grade.
    grade_headers = grade_rows.map { |row| "<th colspan='2'>#{escape_html(row['text'])}</th>" }.join
    bg_headers = grade_rows.map { |_| "<td>B</td><td>G</td>" }.join

    html_content = <<~HTML
      <!DOCTYPE html>
      <html>
      <head>
        <title>Enrollment Table</title>
        <style>
          table { border-collapse: collapse; margin-top: 20px; width: 100%; }
          th, td { border: 1px solid black; padding: 8px; text-align: center; }
          .header { font-weight: bold; background-color: #f0f0f0; }
          .grade { font-weight: bold; background-color: #e0e0e0; }
          .bg-pair { background-color: #f8f8f8; }
          .category { font-weight: bold; text-align: left; }
        </style>
      </head>
      <body>
        <h2>Enrolment (By Social Category)</h2>
        <table>
          <tr class="grade">
            <th rowspan="2">Category</th>
            #{grade_headers}
          </tr>
          <tr class="bg-pair">
            #{bg_headers}
          </tr>
      #{table_rows}
        </table>
      </body>
      </html>
    HTML

    File.write(html_path, html_content)
  end

  # Escapes the characters that are significant in HTML; nil becomes "".
  def self.escape_html(text)
    text.to_s.gsub(/[&<>"']/, HTML_ESCAPES)
  end
  private_class_method :escape_html
end

data/lib/udise_school_report_reader/enrollment_yaml_writer.rb ADDED Viewed

@@ -0,0 +1,62 @@
# Converts the enrolment hash (with :grade_rows, :bg_pairs and
# :"<key>_numbers" entries) into a plain nested Hash suitable for YAML output.
class EnrollmentYamlWriter
  # Output section name => key in the data hash, in output order.
  CATEGORY_SOURCES = {
    'gen' => :gen_numbers,
    'sc' => :sc_numbers,
    'st' => :st_numbers,
    'obc' => :obc_numbers,
    'muslim' => :musl_numbers,
    'christian' => :chris_numbers,
    'sikh' => :sikh_numbers,
    'buddhist' => :budd_numbers,
    'parsi' => :parsi_numbers,
    'jain' => :jain_numbers,
    'others' => :others_numbers,
    'aadhaar' => :aadh_numbers,
    'bpl' => :bpl_numbers,
    'repeater' => :rept_numbers,
    'cwsn' => :cwsn_numbers
  }.freeze

  class << self
    # @param data [Hash, nil] enrolment hash
    # @return [Hash, nil] section => grade => { 'boys' => Integer|nil,
    #   'girls' => Integer|nil }; nil when +data+ is nil. Sections are the
    #   categories above followed by "age_3".."age_22".
    def format_yaml(data)
      return unless data

      grade_rows = data[:grade_rows]
      bg_pairs = data[:bg_pairs]

      sections = CATEGORY_SOURCES.transform_values { |key| data[key] }
      (3..22).each { |age| sections["age_#{age}"] = data[:"age_#{age}_numbers"] }

      sections.transform_values { |numbers| grade_table(numbers, grade_rows, bg_pairs) }
    end

    private

    # Builds the per-grade boys/girls table for one section. The n-th B/G
    # pair is labelled with the n-th grade cell; pairs without a grade cell
    # (or with a blank one) are skipped.
    def grade_table(numbers, grade_rows, bg_pairs)
      table = {}
      bg_pairs.each_with_index do |(x_mid, _), index|
        grade_text = grade_rows[index]&.[]('text')
        next unless grade_text

        nums = numbers&.[](x_mid)
        table[grade_text.downcase.gsub(' ', '_')] = {
          'boys' => count_from(nums&.first),
          'girls' => count_from(nums&.last)
        }
      end
      table
    end

    # Integer value of a cell's text, or nil when the cell is missing, blank
    # or not a plain number (so a placeholder such as "-" is not read as 0).
    def count_from(cell)
      text = cell&.[]('text')&.strip
      text.to_i if text&.match?(/\A\d+\z/)
    end
  end
end

data/lib/udise_school_report_reader/ews_data_reader.rb ADDED Viewed

@@ -0,0 +1,118 @@
# Reads the EWS (Economically Weaker Section) enrolment row on page 1 of a
# report from the intermediate CSV of positioned text cells.
#
# The CSV must have a header row; the columns read here are `page`, `text`,
# `text_x` and `rect_y`.
class EwsDataReader
  GRADES = [
    'Pre-Pri.', 'Class I', 'Class II', 'Class III', 'Class IV', 'Class V',
    'Class VI', 'Class VII', 'Class VIII', 'Class IX', 'Class X', 'Class XI', 'Class XII'
  ].freeze

  # Text that identifies the title cell of the EWS table.
  TITLE_TEXT = 'Total no. of Economically Weaker Section*(EWS) students Enrolled in Schools'.freeze
  # Maximum horizontal distance between a marker and the value under it.
  MATCH_THRESHOLD = 10.0
  # Text used for cells that carry no value.
  PLACEHOLDER = '-'.freeze

  # Convenience wrapper: build a reader for +csv_path+ and run it.
  def self.read(csv_path) = new(csv_path).read

  # Loads the CSV and locates the three rows directly below the title:
  # grade names, B/G markers and values. When the title or those rows cannot
  # be found the reader is left empty and #read returns nil.
  #
  # @param csv_path [String] path of the CSV to read
  def initialize(csv_path)
    @csv_path = csv_path
    @rows = Hash.new { |h, k| h[k] = [] }

    # Group page-1 cells into visual rows keyed by rect_y.
    CSV.foreach(@csv_path, headers: true) do |cell|
      next unless cell['page'] == '1'

      @rows[cell['rect_y'].to_f] << cell
    end

    @title_row = @rows.find { |_, cells| cells.any? { |cell| cell['text']&.include?(TITLE_TEXT) } }
    return unless @title_row

    # Rows with a smaller rect_y than the title, nearest first.
    title_y = @title_row.first
    rows_after_title = @rows.select { |y, _| y < title_y }.sort_by(&:first).reverse
    return unless rows_after_title.size >= 3

    @grades_row, @bg_row, unaligned_values = rows_after_title.first(3).map(&:last)
    [@grades_row, @bg_row].each { |row| row.sort_by! { |cell| cell['text_x'].to_f } }
    @values_row = align_values(unaligned_values, @bg_row)
  end

  # @return [Hash, nil] nil when the table was not found; otherwise a hash
  #   with :grade_rows, :bg_pairs (x midpoint => [B cell, G cell]) and
  #   :ews_numbers (x midpoint => [boys cell or nil, girls cell or nil]).
  def read
    return nil unless @grades_row && @bg_row && @values_row

    bg_pairs = {}
    @bg_row.each_slice(2) do |b, g|
      next unless b && g # skip a trailing marker that has no partner

      bg_pairs[(b['text_x'].to_f + g['text_x'].to_f) / 2] = [b, g]
    end

    {
      grade_rows: @grades_row,
      bg_pairs: bg_pairs,
      ews_numbers: match_numbers_to_pairs(@values_row, bg_pairs)
    }
  end

  private

    # Returns one value cell per marker, in marker order. Each value cell is
    # used at most once; a marker with no value nearby gets a placeholder
    # cell, and blank (nil or whitespace) texts are normalised to "-".
    def align_values(values, markers)
      pool = values.dup
      markers.map do |marker|
        x = marker['text_x'].to_f
        cell = take_closest(pool, x, MATCH_THRESHOLD) || { 'text' => PLACEHOLDER, 'text_x' => x }
        cell['text'] = PLACEHOLDER if cell['text'].to_s.strip.empty?
        cell
      end
    end

    # Removes and returns the cell of +pool+ nearest to +x+, provided it lies
    # strictly within +threshold+; returns nil otherwise.
    def take_closest(pool, x, threshold)
      index = pool.each_index.min_by { |i| (pool[i]['text_x'].to_f - x).abs }
      return unless index && (pool[index]['text_x'].to_f - x).abs < threshold

      pool.delete_at(index)
    end

    # Assigns to every B/G pair the nearest unused value under each marker.
    #
    # @return [Hash] x midpoint => [boys cell or nil, girls cell or nil]
    def match_numbers_to_pairs(remaining_numbers, bg_pairs, threshold = 10.0)
      remaining = remaining_numbers.dup
      bg_pairs.each_with_object({}) do |(x_mid, bg_pair), numbers|
        next unless bg_pair && bg_pair.size == 2

        numbers[x_mid] = bg_pair.map do |marker|
          take_closest(remaining, marker['text_x'].to_f, threshold)
        end
      end
    end
end

data/lib/udise_school_report_reader/ews_html_writer.rb ADDED Viewed

@@ -0,0 +1,63 @@
# Renders the EWS hash (with :grade_rows, :bg_pairs and :ews_numbers) as a
# standalone HTML table with a single "EWS" row.
class EwsHtmlWriter
  HTML_ESCAPES = {
    '&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quot;', "'" => '&#39;'
  }.freeze

  # Writes the table to +html_path+.
  #
  # @param data [Hash, nil] EWS hash; nothing is written when nil
  # @param html_path [String] destination file
  # @return [Integer, nil] result of File.write, or nil when +data+ is nil
  def self.generate_html(data, html_path)
    return unless data

    grade_rows = data[:grade_rows]
    bg_pairs = data[:bg_pairs]
    categories = [['EWS', data[:ews_numbers]]]

    # One table row per category, one boys cell and one girls cell per pair.
    table_rows = categories.map do |category, numbers|
      cells = bg_pairs.map do |x_mid, _|
        # Missing numbers render as empty cells rather than raising.
        nums = numbers&.[](x_mid)
        b_num = nums&.first
        g_num = nums&.last
        "<td>#{b_num ? escape_html(b_num['text']) : ''}</td>" \
          "<td>#{g_num ? escape_html(g_num['text']) : ''}</td>"
      end.join
      "    <tr>\n" \
      "      <td class=\"category\">#{category}</td>\n" \
      "      #{cells}\n" \
      "    </tr>"
    end.join("\n")

    # Header rows: grade names spanning two columns, then a B/G pair per grade.
    grade_headers = grade_rows.map { |row| "<th colspan='2'>#{escape_html(row['text'])}</th>" }.join
    bg_headers = grade_rows.map { |_| "<td>B</td><td>G</td>" }.join

    # NOTE(review): the <title> and <h2> below are the same as in the
    # enrolment writer and do not mention EWS -- confirm whether intended.
    html_content = <<~HTML
      <!DOCTYPE html>
      <html>
      <head>
        <title>Enrollment Table</title>
        <style>
          table { border-collapse: collapse; margin-top: 20px; width: 100%; }
          th, td { border: 1px solid black; padding: 8px; text-align: center; }
          .header { font-weight: bold; background-color: #f0f0f0; }
          .grade { font-weight: bold; background-color: #e0e0e0; }
          .bg-pair { background-color: #f8f8f8; }
          .category { font-weight: bold; text-align: left; }
        </style>
      </head>
      <body>
        <h2>Enrolment (By Social Category)</h2>
        <table>
          <tr class="grade">
            <th rowspan="2">Category</th>
            #{grade_headers}
          </tr>
          <tr class="bg-pair">
            #{bg_headers}
          </tr>
      #{table_rows}
        </table>
      </body>
      </html>
    HTML

    File.write(html_path, html_content)
  end

  # Escapes the characters that are significant in HTML; nil becomes "".
  def self.escape_html(text)
    text.to_s.gsub(/[&<>"']/, HTML_ESCAPES)
  end
  private_class_method :escape_html
end

data/lib/udise_school_report_reader/ews_yaml_writer.rb ADDED Viewed

@@ -0,0 +1,31 @@
# Converts the EWS hash (with :grade_rows, :bg_pairs and :ews_numbers) into a
# plain nested Hash suitable for YAML output.
class EwsYamlWriter
  # @param data [Hash, nil] EWS hash
  # @return [Hash, nil] { 'ews' => { grade => { 'boys' => Integer|nil,
  #   'girls' => Integer|nil } } }, or nil when +data+ is nil
  def self.format_yaml(data)
    return unless data

    grade_rows = data[:grade_rows]
    bg_pairs = data[:bg_pairs]
    categories = { 'ews' => data[:ews_numbers] }

    categories.transform_values do |numbers|
      table = {}
      # The n-th B/G pair is labelled with the n-th grade cell; pairs without
      # a grade cell (or with a blank one) are skipped.
      bg_pairs.each_with_index do |(x_mid, _), index|
        grade_text = grade_rows[index]&.[]('text')
        next unless grade_text

        nums = numbers&.[](x_mid)
        table[grade_text.downcase.gsub(' ', '_')] = {
          'boys' => count_from(nums&.first),
          'girls' => count_from(nums&.last)
        }
      end
      table
    end
  end

  # Integer value of a cell's text, or nil when the cell is missing, blank or
  # not a plain number. A "-" placeholder therefore yields nil, not 0 (which
  # is what String#to_i would return for it).
  def self.count_from(cell)
    text = cell&.[]('text')&.strip
    text.to_i if text&.match?(/\A\d+\z/)
  end
  private_class_method :count_from
end

data/lib/udise_school_report_reader/location_data_reader.rb ADDED Viewed

@@ -0,0 +1,47 @@
+require_relative 'data_reader_base'
# Declares the location fields of a report. All behaviour comes from the
# DataReaderBase mixin, which is defined in another file.
class LocationDataReader
  include DataReaderBase

  # Field label => options. Each entry has a :key_path and most have an
  # :end_pattern regexp.
  # NOTE(review): presumably :key_path is where the value is stored in the
  # output and :end_pattern is the text that ends the value -- confirm
  # against DataReaderBase.
  # The table is frozen so it cannot be modified by accident at runtime.
  FIELD_MAPPINGS = {
    'State' => {
      key_path: ['location', 'state'],
      end_pattern: /District/
    },
    'District' => {
      key_path: ['location', 'district'],
      end_pattern: /Block/
    },
    'Block' => {
      key_path: ['location', 'block'],
      end_pattern: /Rural/
    },
    'Rural / Urban' => {
      key_path: ['location', 'area_type'],
      end_pattern: /Cluster/
    },
    'Pincode' => {
      key_path: ['location', 'pincode']
    },
    'Ward' => {
      key_path: ['location', 'ward'],
      end_pattern: /Mohalla/
    },
    'Cluster' => {
      key_path: ['location', 'cluster'],
      end_pattern: /Ward/
    },
    'Municipality' => {
      key_path: ['location', 'municipality'],
      end_pattern: /Assembly/
    },
    'Assembly Const.' => {
      key_path: ['location', 'assembly_constituency'],
      end_pattern: /Parl/
    },
    'Parl. Constituency' => {
      key_path: ['location', 'parliamentary_constituency'],
      end_pattern: /School/
    }
  }.freeze
end

data/lib/udise_school_report_reader/official_data_reader.rb ADDED Viewed

@@ -0,0 +1,40 @@
+require_relative 'data_reader_base'
# Declares the official/administrative fields of a report (establishment and
# recognition years, affiliation boards, management). All behaviour comes
# from the DataReaderBase mixin, which is defined in another file.
class OfficialDataReader
  include DataReaderBase

  # Field label => options. Each entry has a :key_path plus either
  # `value_type: :integer` or an :end_pattern regexp.
  # NOTE(review): presumably :value_type selects a conversion and
  # :end_pattern is the text that ends the value -- confirm against
  # DataReaderBase.
  # The table is frozen so it cannot be modified by accident at runtime.
  FIELD_MAPPINGS = {
    'Year of Establishment' => {
      key_path: ['official', 'established'],
      value_type: :integer
    },
    'Year of Recognition-Pri.' => {
      key_path: ['official', 'recognition', 'primary'],
      value_type: :integer
    },
    'Year of Recognition-Upr.Pri.' => {
      key_path: ['official', 'recognition', 'upper_primary'],
      value_type: :integer
    },
    'Year of Recognition-Sec.' => {
      key_path: ['official', 'recognition', 'secondary'],
      value_type: :integer
    },
    'Year of Recognition-Higher Sec.' => {
      key_path: ['official', 'recognition', 'higher_secondary'],
      value_type: :integer
    },
    'Affiliation Board-Sec' => {
      key_path: ['official', 'affiliation', 'secondary'],
      end_pattern: /Affiliation Board-HSec/
    },
    'Affiliation Board-HSec' => {
      key_path: ['official', 'affiliation', 'higher_secondary'],
      end_pattern: /Is this/
    },
    'School Management' => {
      key_path: ['official', 'management'],
      end_pattern: /School Type/
    }
  }.freeze
end

data/lib/udise_school_report_reader/pdf_block_extractor.rb ADDED Viewed

@@ -0,0 +1,49 @@
# Extracts positioned text blocks (the content between a BT and an ET
# operator) from the raw content stream of every page.
class PDFBlockExtractor
  # "1 0 0 1 x y Tm" -- captures x and y.
  TEXT_MATRIX = /1\s+0\s+0\s+1\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+Tm/
  # "/F<n> <size> Tf" -- captures the font number and size.
  FONT_SELECTION = %r{/F(\d+)\s+(\d+(\.\d+)?)\s+Tf}
  # "(text) Tj" -- captures the shown text.
  SHOWN_TEXT = /\((.*?)\)\s*Tj/
  # A parenthesised string literal, including backslash-escaped characters.
  STRING_LITERAL = /\((?:\\.|[^\\)])*\)/

  # @param reader [#pages] object whose pages each respond to #raw_content
  # @return [Array<Hash>] one hash per non-empty block that has coordinates,
  #   with keys :page (1-based), :start_line, :text (pieces joined with a
  #   space), :x, :y, :end_line and, when a font line was seen, :font and
  #   :font_size
  def self.extract_blocks(reader)
    blocks = []

    reader.pages.each_with_index do |page, index|
      current_block = nil # nil while outside a BT..ET pair

      page.raw_content.each_line do |line|
        # Look for operators outside string literals only, so that shown text
        # such as "(SUBTOTAL) Tj" is not mistaken for a BT operator.
        operators = line.gsub(STRING_LITERAL, ' ').split

        if operators.include?('BT')
          current_block = { page: index + 1, start_line: line.strip, text: [] }
        elsif current_block.nil?
          next # stray content (including a stray ET) outside any block
        elsif (matrix = line.match(TEXT_MATRIX))
          # Only the first position inside a block is kept.
          unless current_block[:x] && current_block[:y]
            current_block[:x] = matrix[1].to_f
            current_block[:y] = matrix[2].to_f
          end
        elsif (font = line.match(FONT_SELECTION))
          # Only the first font inside a block is kept.
          unless current_block[:font] && current_block[:font_size]
            current_block[:font] = "F#{font[1]}"
            current_block[:font_size] = font[2].to_f
          end
        elsif (shown = line.match(SHOWN_TEXT))
          current_block[:text] << shown[1].gsub(/\\/, '') # drop escape backslashes
        elsif operators.include?('ET')
          current_block[:end_line] = line.strip
          current_block[:text] = current_block[:text].join(' ')
          # Keep only blocks that have both text and a position.
          blocks << current_block if !current_block[:text].empty? && current_block[:x] && current_block[:y]
          current_block = nil
        end
      end
    end

    blocks
  end
end

data/lib/udise_school_report_reader/pdf_content_compressor.rb ADDED Viewed

@@ -0,0 +1,36 @@
# Reduces a raw PDF content stream to the texts shown inside BT..ET blocks,
# one per line.
class PDFContentCompressor
  # Fragments of words that are emitted as separate Tj operations and must be
  # glued to the text shown just before them.
  SPLIT_FRAGMENT = /^(?:Non|Residenti|al|Digit|al Facil|ities)$/
  # "(text) Tj" -- captures the shown text.
  SHOWN_TEXT = /\((.*?)\)\s*Tj/
  # A parenthesised string literal, including backslash-escaped characters.
  STRING_LITERAL = /\((?:\\.|[^\\)])*\)/

  # @param content [String] raw content stream
  # @return [String] the shown texts joined with "\n"; fragments matching
  #   SPLIT_FRAGMENT are appended to the preceding text of the same block.
  #   Text shown outside a BT..ET block is ignored.
  def self.compress(content)
    compressed = []
    pending = '' # text collected so far that has not been emitted yet
    in_bt_block = false

    content.each_line do |line|
      # Look for operators outside string literals only, so that shown text
      # such as "(SUBTOTAL) Tj" or "(TOILET) Tj" is not taken for BT / ET.
      operators = line.gsub(STRING_LITERAL, ' ').split

      if operators.include?('BT')
        in_bt_block = true
        pending = ''
      elsif operators.include?('ET')
        in_bt_block = false
        compressed << pending unless pending.empty?
        pending = '' # a repeated ET must not emit the same text twice
      elsif in_bt_block && (shown = line.match(SHOWN_TEXT))
        text = shown[1].strip
        if text.match?(SPLIT_FRAGMENT)
          pending += text
        else
          compressed << pending unless pending.empty?
          pending = text
        end
      end
    end

    compressed.reject(&:empty?).join("\n")
  end
end
+end