rabbit-slide-kou-nagoya-rubykaigi-03 2017.2.11.0

Files changed (39)
  1. checksums.yaml +7 -0
  2. data/.rabbit +1 -0
  3. data/README.rd +81 -0
  4. data/Rakefile +18 -0
  5. data/apache-arrow-gi-based-ruby-bindings.rab +766 -0
  6. data/config.yaml +27 -0
  7. data/data/hebi.svg +1108 -0
  8. data/data/hebi_blue.svg +1110 -0
  9. data/data/panda_kotatsu.svg +78 -0
  10. data/examples/data/bow.data.tf.filtered +0 -0
  11. data/examples/data/bow.data.tf.raw +0 -0
  12. data/examples/data/bow.data.tfidf.filtered +0 -0
  13. data/examples/data/bow.data.tfidf.raw +0 -0
  14. data/examples/data/bow.metadata.tf.filtered +1 -0
  15. data/examples/data/bow.metadata.tf.raw +1 -0
  16. data/examples/data/bow.metadata.tfidf.filtered +1 -0
  17. data/examples/data/bow.metadata.tfidf.raw +1 -0
  18. data/examples/data/topics.tf.filtered +0 -0
  19. data/examples/data/topics.tf.raw +0 -0
  20. data/examples/data/topics.tfidf.filtered +0 -0
  21. data/examples/data/topics.tfidf.raw +0 -0
  22. data/examples/estimate-topics.py +72 -0
  23. data/examples/raw-show-related-terms.rb +38 -0
  24. data/examples/raw-write-bow.rb +124 -0
  25. data/examples/read.rb +30 -0
  26. data/examples/run.sh +33 -0
  27. data/examples/show-related-terms.rb +39 -0
  28. data/examples/show.sh +20 -0
  29. data/examples/write-bow.rb +96 -0
  30. data/examples/write.py +15 -0
  31. data/images/clear-code.svg +161 -0
  32. data/images/copy-data-between-system.png +0 -0
  33. data/images/groonga-logo.svg +118 -0
  34. data/images/rroonga-logo.svg +117 -0
  35. data/images/share-data-between-system.png +0 -0
  36. data/images/system-with-ruby.png +0 -0
  37. data/pdf/nagoya-rubykaigi-03-apache-arrow-gi-based-ruby-bindings.pdf +0 -0
  38. data/theme.rb +6 -0
  39. metadata +113 -0
@@ -0,0 +1,78 @@
+ <?xml version="1.0" encoding="utf-8"?>
+ <!-- Generator: Adobe Illustrator 15.1.0, SVG Export Plug-In . SVG Version: 6.00 Build 0) -->
+ <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+ <svg version="1.1" id="レイヤー_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px"
+ y="0px" width="208.583px" height="84.229px" viewBox="0 0 208.583 84.229" enable-background="new 0 0 208.583 84.229"
+ xml:space="preserve">
+ <path fill="#070001" d="M166.844,51.551c0,0,22.529,5.493,27.457,4.416c4.928-1.076,12.331,1.57,12.592,6.651
+ c0.547,10.658-21.068,4.085-40.164-4.369S166.844,51.551,166.844,51.551z"/>
+ <path fill="#FBFAE8" stroke="#060001" stroke-miterlimit="10" d="M139.649,21.6c-0.25-1.75-14.215-9.95-27.25-10.25
+ c-21.75-0.5-27.114,20.586-29.25,34c-0.668,4.197-5.213,10.727-7.687,11.82c-3.067,1.357-3.91,3.32-2.447,4.14
+ c2.438,1.366,37.634,7.29,77.384-3.21C138.899,39.35,139.649,21.6,139.649,21.6z"/>
+ <path fill="#070001" d="M106.943,12.473c-0.174-2.576-4.785-7.249-7.797-3.365c-3.014,3.884-2.32,5.065-0.377,6.572
+ C100.711,17.186,107.285,17.564,106.943,12.473z"/>
+ <path fill="#FBFAE8" stroke="#080103" stroke-width="2" stroke-miterlimit="10" d="M190.399,55.85c6.013-14.03,0-50-25-52
+ s-34,13-36,26s14,34,28,37S187.399,62.85,190.399,55.85z"/>
+ <path fill="#070001" d="M86.064,63.631c-7.491-0.821-13.198-1.123-14.084-1.619c-1.463-0.82,0.227-3.556,3.294-4.912
+ c2.244-0.992,4.75-4.125,6.5-8C81.537,52.529,82.887,60.837,86.064,63.631z"/>
+ <path fill="#070001" d="M186.399,21.264c4.852-3.508,1.593-14.713-5-14.164c-9.25,0.771-9,6.029-6,10.368
+ C178.898,22.526,183.399,23.433,186.399,21.264z"/>
+ <path fill="#070001" d="M196.899,37.6c1.265-2.53-0.22-8.556-5.5-7.5c-2.5,0.5-0.75,3.058-0.75,5.75
+ C190.649,39.1,194.399,42.6,196.899,37.6z"/>
+ <path fill="#070001" d="M186.716,52.486c1.392-2.179-1.573-7.141-5.144-6.354s-1.18,5.688,1.06,6.505
+ C184.87,53.454,186.716,52.486,186.716,52.486z"/>
+ <path fill="#070001" d="M176.217,36.27c1.641-3.109-1.977-12.587-8.17-10.74c-7.2,2.148-0.514,10.408,1.726,11.225
+ C172.011,37.571,174.765,39.023,176.217,36.27z"/>
+ <path fill="#DB698C" d="M159.466,42.644c0.479-0.87,7.156,2.642,13.117,3.996c0.071,4.148,2.923,13.869,2.223,14.086
+ c-1.019,0.474-3.349-4.081-6.556-7.665C165.042,49.479,158.986,43.514,159.466,42.644z"/>
+ <path fill="#070001" d="M176.371,47.823c1.012-1.585-1.146-5.194-3.741-4.622c-2.597,0.573-0.857,4.136,0.771,4.731
+ C175.029,48.527,176.371,47.823,176.371,47.823z"/>
+ <path fill="#070001" d="M131.899,21.85c-0.5,13,5.417,32.583,15.25,40.75c7.289,6.054,16,7.75,14,13.75s-13.834,7.584-21.833-0.75
+ C132.071,68.053,122.149,34.35,131.899,21.85z"/>
+ <g>
+ <path fill="#EDE6CD" d="M115.6,83.201c-3.067,0-12.861-1.058-22.333-2.08c-8.52-0.92-17.329-1.871-19.742-1.871
+ c-1.398,0-4.595,0.29-8.643,0.656c-6.615,0.6-15.674,1.42-22.3,1.42c-3.993,0-6.667-0.293-8.178-0.897
+ C21.229,75.158,2.573,52.935,0.034,43.98c-2.013-7.1,1.647-10.246,5.521-13.576c2.168-1.863,4.625-3.975,6.489-7.048
+ c1.47-2.424,2.82-4.867,4.125-7.23c3.748-6.783,6.708-12.141,10.163-12.77c2.458-0.447,10.359-0.664,24.156-0.664
+ c22.211,0,52.131,0.595,54.616,0.645c0.991-0.292,2.493-0.441,4.469-0.441c6.996,0,20.62,2.044,26.955,5.739
+ c5.885,3.433,11.121,18.022,11.121,27.114c0,7.64,2.998,15.207,9.164,23.136c1.505,1.935,2.045,4.46,1.521,7.111
+ c-0.764,3.86-3.703,7.504-8.063,9.996C141.409,81.057,121.878,83.201,115.6,83.201z"/>
+ <path fill="#50482E" d="M50.489,3.693c23.329,0,54.758,0.648,54.758,0.648c0.901-0.3,2.419-0.444,4.326-0.444
+ c7.353,0,20.496,2.129,26.451,5.603c5.418,3.161,10.625,17.233,10.625,26.25c0,10,5,18.125,9.375,23.75
+ c3.165,4.069,1.45,11.225-6.25,15.625c-8.75,5-28.343,7.076-34.175,7.076S78.628,78.25,73.524,78.25
+ c-3.837,0-20.393,2.076-30.943,2.076c-3.48,0-6.305-0.226-7.807-0.826c-12.5-5-31.25-26.875-33.778-35.792
+ C-1.9,33.491,7.57,32.661,12.899,23.875c5.525-9.11,9.53-18.792,13.612-19.534C29.185,3.855,38.825,3.693,50.489,3.693
+ M50.492,1.693v2l-0.001-2c-13.859,0-21.819,0.223-24.336,0.68c-3.922,0.713-6.832,5.98-10.859,13.27
+ c-1.3,2.354-2.645,4.788-4.105,7.195c-1.781,2.937-4.071,4.905-6.285,6.808c-3.943,3.389-8.02,6.893-5.833,14.608
+ c2.586,9.119,21.563,31.744,34.96,37.104c1.653,0.661,4.37,0.969,8.549,0.969c6.672,0,15.757-0.823,22.391-1.424
+ c3.863-0.35,7.2-0.652,8.553-0.652c2.359,0,11.142,0.948,19.635,1.865c9.498,1.025,19.319,2.086,22.44,2.086
+ c6.354,0,26.144-2.184,35.167-7.34c4.612-2.636,7.729-6.524,8.548-10.67c0.582-2.939-0.026-5.752-1.711-7.919
+ c-6.025-7.747-8.954-15.113-8.954-22.522c0-9.543-5.36-24.328-11.617-27.978c-6.572-3.833-20.17-5.875-27.459-5.875
+ c-2.002,0-3.512,0.144-4.598,0.438C101.629,2.268,72.335,1.693,50.492,1.693L50.492,1.693z"/>
+ </g>
+ <path fill="#DFD2B4" d="M0.753,43.987c-2.94-10.371,6.674-11.214,12.083-20.133c5.609-9.248,9.675-19.077,13.818-19.83
+ c0.649-0.118,1.706-0.217,3.096-0.299l21.07,19.707c-8.461,20.641-10.847,34.102-11.506,46.02
+ c-0.309,5.582,2.985,8.004,5.626,10.744c-4.057,0.052-7.831,0.294-9.518-0.381C22.733,74.738,3.319,53.04,0.753,43.987z"/>
+ <path fill="#DFD2B4" d="M157.337,62.125c0.738,4.104-1.227,9.636-7.354,13.138c-8.968,5.125-28.66,6.873-34.637,6.929
+ c-3.411,0.032-8.264-0.592-16.27-1.473c23.169-1.239,37.861-4.521,49.712-9.23C153.932,69.442,155.812,65.644,157.337,62.125z"/>
+ <path fill="#DB2218" d="M146.645,47.98c-7.629,0.012-10.737-7.677-10.62-13.605c0.087-4.386,4.525-8.732,8.566-10.085
+ c1.273,4.085,2.059,8.34,2.059,11.835c0,4.25,1.001,8.038,2.375,11.563C148.294,47.79,147.447,47.979,146.645,47.98z"/>
+ <path fill="#DB2218" d="M124.889,12.367c-2.125-0.755-6.124,2.155-6.183,5.093c-0.058,2.938,1.482,6.749,5.264,6.743
+ c4.604-0.007,6.827-3.49,5.821-7.134S126.056,12.782,124.889,12.367z"/>
+ <path fill="#DB2218" d="M118.161,53.44c12.887,0,20.824,16.796,16.107,26.648c-7.217,1.526-15.329,2.113-18.64,2.113
+ c-1.528,0-5.839-0.214-10.419-0.854c-1.315-1.664-2.521-3.419-3.495-4.882C96.781,69.064,103.358,53.44,118.161,53.44z"/>
+ <path fill="#DB2218" d="M67.463,74.771c11.481-6.263,8.771-19.138,1.595-24.719c-7.176-5.582-19.089,5.059-19.934,14.353
+ C48.326,73.177,58.692,79.556,67.463,74.771z"/>
+ <path fill="#DB2218" d="M102.274,38.875c0-6.374-5.625-12.5-13.125-6.25s0.625,12.5,3.125,13.125S102.274,43.25,102.274,38.875z"/>
+ <path fill="#DB2218" d="M54.149,26.375c1.207,5.634-3.125,9.375-7.5,10.625s-19.375-3.125-10.625-16.25
+ C41.929,11.894,52.274,17.625,54.149,26.375z"/>
+ <path fill="#C61F1E" d="M7.613,29.643c1.416,0.671,2.786,1.708,4.045,3.196c7.782,9.197,3.864,17.439-4.244,21.223
+ c-0.294,0.137-0.583,0.257-0.87,0.368c-3-4.142-5.167-7.92-5.918-10.569C-1.502,36.355,2.944,33.839,7.613,29.643z"/>
+ <path fill="#C61F1E" d="M112.649,82.156c-1.752-0.128-3.811-0.349-3.82-0.349c-1.07-0.101-1.986-0.194-3.523-0.37
+ c-0.197-0.248-0.645-0.791-0.832-1.039c12.78-0.894,22.723-2.367,31.018-4.58c-0.195,1.512-0.633,3.033-1.266,4.355
+ c-0.17,0.107-5.879,1.047-11.999,1.582C119.203,82.02,116.356,82.428,112.649,82.156z"/>
+ <path fill="#C61F1E" d="M36.024,20.75c2.061-3.09,4.662-4.4,7.255-4.43l6.37,7.471c-0.894,5.897-2.957,8.532-4.046,13.383
+ C40.32,37.604,27.876,32.973,36.024,20.75z"/>
+ <path fill="#815C25" d="M100.165,0.55c4.374,0,29.452,20.08,29.452,24.204c0,2.916-75.819,2.187-80.923,0
+ C44.43,22.926,19.241,3.757,23.617,1.57C28.181-0.712,100.165,0.55,100.165,0.55z"/>
+ </svg>
@@ -0,0 +1 @@
+ {"n_documents":13181,"n_features":15914}
@@ -0,0 +1 @@
+ {"n_documents":14066,"n_features":15916}
@@ -0,0 +1 @@
+ {"n_documents":13181,"n_features":15914}
@@ -0,0 +1 @@
+ {"n_documents":14066,"n_features":15916}
@@ -0,0 +1,72 @@
+ #!/usr/bin/env python
+
+ import sys
+ sys.path.append("/home/kou/work/cpp/arrow/python")
+
+ import json
+
+ import scipy as sp
+ import pandas as pd
+ import pyarrow as A
+ from sklearn.decomposition import LatentDirichletAllocation
+
+ LDA = LatentDirichletAllocation
+
+ metadata_path = sys.argv[1]
+ data_path = sys.argv[2]
+ topics_path = sys.argv[3]
+ if len(sys.argv) >= 5:
+     n_documents = int(sys.argv[4])
+ else:
+     n_documents = None
+ if len(sys.argv) >= 6:
+     n_topics = int(sys.argv[5])
+ else:
+     n_topics = None
+
+ if n_topics is None or n_topics == -1:
+     n_topics = 100
+
+ with open(metadata_path) as metadata_file:
+     metadata = json.load(metadata_file)
+     if n_documents is None or n_documents == -1:
+         n_documents = metadata["n_documents"]
+     else:
+         n_documents = min(n_documents, metadata["n_documents"])
+     n_features = metadata["n_features"]
+
+ lda = LDA(n_topics=n_topics,
+           learning_method="online",
+           total_samples=n_documents,
+           n_jobs=1)
+
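+ # Read the bag-of-words Arrow stream (one record batch per document) and
+ # update the online LDA model incrementally with partial_fit().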
+ with A.io.MemoryMappedFile(data_path, "rb") as source:
+     reader = A.ipc.StreamReader(source)
+     for i, batch in enumerate(reader):
+         if i >= n_documents:
+             break
+         sys.stdout.write("\r%.3f%%" % ((i / n_documents) * 100))
+         df = batch.to_pandas()
+         corpus = sp.sparse.csr_matrix((df["score"].values,
+                                        df["term_id"].values,
+                                        [0, df["term_id"].size]),
+                                       shape=(1, n_features))
+         lda.partial_fit(corpus)
+ sys.stdout.write("\n")
+
+ def topic_to_df(topic):
+     n_top_terms = 10
+     return pd.DataFrame([[i, topic[i]]
+                          for i in topic.argsort()[:-n_top_terms - 1:-1]],
+                         columns=["term_id", "score"])
+
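+ # Write the top terms of each estimated topic back out as another Arrow
+ # record batch stream (term_id, score), one batch per topic.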
+ topic = lda.components_[0]
+ topic_df = topic_to_df(topic)
+ schema = A.RecordBatch.from_pandas(topic_df).schema
+ with open(topics_path, "wb") as sink:
+     writer = A.ipc.StreamWriter(sink, schema)
+     for topic in lda.components_:
+         topic_df = topic_to_df(topic)
+         topic_record_batch = A.RecordBatch.from_pandas(topic_df)
+         writer.write_batch(topic_record_batch)
+     writer.close()
@@ -0,0 +1,38 @@
+ #!/usr/bin/env ruby
+
+ require "groonga"
+ require "gi"
+
+ db_path = ARGV[0]
+ output_path = ARGV[1]
+
+ Arrow = GI.load("Arrow")
+ ArrowIO = GI.load("ArrowIO")
+ ArrowIPC = GI.load("ArrowIPC")
+
+ Groonga::Database.open(db_path)
+ terms = Groonga["Words"]
+
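+ # Raw GObject Introspection API: read each topic's record batch from the
+ # Arrow stream and map term IDs back to Groonga "Words" keys.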
+ input_stream = ArrowIO::MemoryMappedFile.open(output_path, :read)
+ begin
+   reader = ArrowIPC::StreamReader.open(input_stream)
+   loop do
+     record_batch = reader.next_record_batch
+     break if record_batch.nil?
+     columns = record_batch.columns
+     related_terms = []
+     previous_score = nil
+     record_batch.n_rows.times do |i|
+       score = columns[1].get_value(i)
+       break if score < 0.1
+       previous_score ||= score
+       break if (previous_score - score) > (score / 2.0)
+       term = Groonga::Record.new(terms, columns[0].get_value(i)).key
+       related_terms << [term, score]
+     end
+     next if related_terms.size < 2
+     p related_terms
+   end
+ ensure
+   input_stream.close
+ end
@@ -0,0 +1,124 @@
+ #!/usr/bin/env ruby
+
+ require "json" # Hash#to_json below needs the json library
+ require "groonga"
+ require "gi"
+
+ db_path = ARGV[0]
+ metadata_output_path = ARGV[1]
+ data_output_path = ARGV[2]
+ use_tfidf = (ARGV[3] != "tf")
+ use_filter = (ARGV[4] != "raw")
+
+ Arrow = GI.load("Arrow")
+ ArrowIO = GI.load("ArrowIO")
+ ArrowIPC = GI.load("ArrowIPC")
+
+ Groonga::Database.open(db_path)
+
+ Groonga::Schema.define do |schema|
+   schema.create_table("Words",
+                       :type => :patricia_trie,
+                       :key_type => "ShortText",
+                       :default_tokenizer => "TokenMecab",
+                       :normalizer => "NormalizerAuto") do |table|
+     table.index("Entries.document")
+   end
+ end
+
+ n_entries = Groonga["Entries"].size
+ too_many_much_threshold = n_entries * 0.25
+ too_less_much_threshold = n_entries * 0.001
+
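+ # Walk the Groonga inverted index ("Words.Entries_document") to build a
+ # bag-of-words per document, optionally dropping terms that appear in too
+ # many or too few documents and optionally weighting by term frequency.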
+ bow = {}
+ index = Groonga["Words.Entries_document"]
+ max_term_id = 0
+ index.table.open_cursor(:order_by => :id) do |table_cursor|
+   table_cursor.each do |term|
+     n_match_documents = index.estimate_size(term)
+     # p [term.key, n_match_documents, (n_match_documents / n_entries.to_f)]
+     if use_filter
+       if n_match_documents <= too_less_much_threshold
+         p [:skip, :too_less, term.key, n_match_documents]
+         next
+       end
+       if n_match_documents >= too_many_much_threshold
+         p [:skip, :too_many, term.key, n_match_documents]
+         next
+       end
+     end
+     max_term_id = [max_term_id, term.id].max
+     df = Math.log(n_entries.to_f / n_match_documents)
+     index.open_cursor(term.id,
+                       :with_position => false) do |index_cursor|
+       index_cursor.each(:reuse_posting_object => true) do |posting|
+         next unless posting.record.version.key == "2.4.0"
+         bow[posting.record_id] ||= []
+         if use_tfidf
+           score = posting.term_frequency / df
+         else
+           score = posting.term_frequency
+         end
+         bow[posting.record_id] << [posting.term_id, score]
+       end
+     end
+   end
+ end
+
+ File.open(metadata_output_path, "w") do |metadata_file|
+   metadata_file.puts({
+     "n_documents" => bow.size,
+     "n_features" => max_term_id,
+   }.to_json)
+ end
+
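+ # Convenience wrappers: the GObject Introspection based builder classes
+ # (UInt32ArrayBuilder, DoubleArrayBuilder) are wrapped so that
+ # Arrow::UInt32Array.new(values) builds an array directly.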
+ module Arrow
+   class ArrayBuilder
+     class << self
+       def build(values)
+         builder = new
+         values.each do |value|
+           builder.append(value)
+         end
+         builder.finish
+       end
+     end
+   end
+
+   class UInt32Array
+     class << self
+       def new(values)
+         UInt32ArrayBuilder.build(values)
+       end
+     end
+   end
+
+   class DoubleArray
+     class << self
+       def new(values)
+         DoubleArrayBuilder.build(values)
+       end
+     end
+   end
+ end
+
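+ # Write one record batch per document to the Arrow stream; the schema is
+ # (term_id: uint32, score: double).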
+ output_stream = ArrowIO::FileOutputStream.open(data_output_path, false)
+ begin
+   term_id_field = Arrow::Field.new("term_id", Arrow::UInt32DataType.new)
+   score_field = Arrow::Field.new("score", Arrow::DoubleDataType.new)
+   schema = Arrow::Schema.new([term_id_field, score_field])
+   writer = ArrowIPC::StreamWriter.open(output_stream, schema)
+   begin
+     bow.each do |record_id, words|
+       term_ids = Arrow::UInt32Array.new(words.collect(&:first))
+       scores = Arrow::DoubleArray.new(words.collect(&:last))
+       record_batch = Arrow::RecordBatch.new(schema,
+                                             words.size,
+                                             [term_ids, scores])
+       writer.write_record_batch(record_batch)
+     end
+   ensure
+     writer.close
+   end
+ ensure
+   output_stream.close
+ end
@@ -0,0 +1,30 @@
+ #!/usr/bin/env ruby
+
+ require "gi"
+
+ Arrow = GI.load("Arrow")
+ ArrowIO = GI.load("ArrowIO")
+ ArrowIPC = GI.load("ArrowIPC")
+
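+ # Reopen the generated classes to add Ruby-ish conveniences:
+ # Array#[] and Enumerable on top of the raw get_value/length API.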
+ module Arrow
+   class Array
+     def [](i)
+       get_value(i)
+     end
+
+     include Enumerable
+     def each
+       length.times do |i|
+         yield(self[i])
+       end
+     end
+   end
+ end
+
+ file = ArrowIO::MemoryMappedFile.open("/tmp/xxx", :read)
+ reader = ArrowIPC::FileReader.open(file)
+ p reader.schema.fields.collect(&:name)
+ record_batch = reader.get_record_batch(0)
+ record_batch.n_rows.times do |i|
+   p record_batch.columns.collect {|column| column[i]}
+ end
@@ -0,0 +1,33 @@
+ #!/bin/sh
+
+ export LD_LIBRARY_PATH=/tmp/local/lib:$LD_LIBRARY_PATH
+ export GI_TYPELIB_PATH=/tmp/local/lib/girepository-1.0:$GI_TYPELIB_PATH
+
+ base_dir=$(dirname $0)
+ data_dir=$base_dir/data
+
+ mkdir -p $data_dir
+
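+ # Run all four score/filter combinations (tf/tfidf x raw/filtered) in
+ # parallel: Ruby writes the bag-of-words as an Arrow stream, then Python
+ # estimates topics from the same data.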
+ for score in tf tfidf; do
+   for filter in raw filtered; do
+     (
+       ruby \
+         -I ~/work/ruby/rarrow/lib \
+         -I ~/work/ruby/rroonga/lib \
+         -I ~/work/ruby/rroonga/ext/groonga \
+         $base_dir/write-bow.rb \
+         ~/work/ruby/rurema-search/groonga-database/bitclust.db \
+         $data_dir/bow.metadata.$score.$filter \
+         $data_dir/bow.data.$score.$filter \
+         $score \
+         $filter &&
+       python \
+         $base_dir/estimate-topics.py \
+         $data_dir/bow.metadata.$score.$filter \
+         $data_dir/bow.data.$score.$filter \
+         $data_dir/topics.$score.$filter
+     ) &
+   done
+ done
+
+ wait
@@ -0,0 +1,39 @@
+ #!/usr/bin/env ruby
+
+ require "groonga"
+ require "arrow"
+
+ db_path = ARGV[0]
+ topics_path = ARGV[1]
+
+ Groonga::Database.open(db_path)
+ terms = Groonga["Words"]
+ index = Groonga["Words.Entries_document"]
+
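+ # Same job as raw-show-related-terms.rb, but through the Ruby-ish "arrow"
+ # gem API: block-style open, Enumerable readers, and records accessible by
+ # column name.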
+ Arrow::IO::MemoryMappedFile.open(topics_path, :read) do |input_stream|
+   Arrow::IPC::StreamReader.open(input_stream) do |reader|
+     reader.each do |record_batch|
+       related_terms = []
+       previous_score = nil
+       # p :topic_raw
+       # record_batch.each do |record|
+       #   term = Groonga::Record.new(terms, record["term_id"]).key
+       #   p [record["term_id"], record["score"], term, index.estimate_size(term)]
+       # end
+       record_batch.each do |record|
+         score = record["score"]
+         break if score < 0.1
+         previous_score ||= score
+         break if (previous_score - score) > (previous_score / 2.0)
+         previous_score = score
+         term = Groonga::Record.new(terms, record["term_id"]).key
+         related_terms << [term, score]
+       end
+       next if related_terms.size < 2
+       p :topic
+       related_terms.each do |term|
+         p term
+       end
+     end
+   end
+ end