rabbit-slide-kou-nagoya-rubykaigi-03 2017.2.11.0
- checksums.yaml +7 -0
- data/.rabbit +1 -0
- data/README.rd +81 -0
- data/Rakefile +18 -0
- data/apache-arrow-gi-based-ruby-bindings.rab +766 -0
- data/config.yaml +27 -0
- data/data/hebi.svg +1108 -0
- data/data/hebi_blue.svg +1110 -0
- data/data/panda_kotatsu.svg +78 -0
- data/examples/data/bow.data.tf.filtered +0 -0
- data/examples/data/bow.data.tf.raw +0 -0
- data/examples/data/bow.data.tfidf.filtered +0 -0
- data/examples/data/bow.data.tfidf.raw +0 -0
- data/examples/data/bow.metadata.tf.filtered +1 -0
- data/examples/data/bow.metadata.tf.raw +1 -0
- data/examples/data/bow.metadata.tfidf.filtered +1 -0
- data/examples/data/bow.metadata.tfidf.raw +1 -0
- data/examples/data/topics.tf.filtered +0 -0
- data/examples/data/topics.tf.raw +0 -0
- data/examples/data/topics.tfidf.filtered +0 -0
- data/examples/data/topics.tfidf.raw +0 -0
- data/examples/estimate-topics.py +72 -0
- data/examples/raw-show-related-terms.rb +38 -0
- data/examples/raw-write-bow.rb +124 -0
- data/examples/read.rb +30 -0
- data/examples/run.sh +33 -0
- data/examples/show-related-terms.rb +39 -0
- data/examples/show.sh +20 -0
- data/examples/write-bow.rb +96 -0
- data/examples/write.py +15 -0
- data/images/clear-code.svg +161 -0
- data/images/copy-data-between-system.png +0 -0
- data/images/groonga-logo.svg +118 -0
- data/images/rroonga-logo.svg +117 -0
- data/images/share-data-between-system.png +0 -0
- data/images/system-with-ruby.png +0 -0
- data/pdf/nagoya-rubykaigi-03-apache-arrow-gi-based-ruby-bindings.pdf +0 -0
- data/theme.rb +6 -0
- metadata +113 -0
data/data/panda_kotatsu.svg
ADDED
@@ -0,0 +1,78 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Generator: Adobe Illustrator 15.1.0, SVG Export Plug-In . SVG Version: 6.00 Build 0) -->
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg version="1.1" id="レイヤー_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px"
+y="0px" width="208.583px" height="84.229px" viewBox="0 0 208.583 84.229" enable-background="new 0 0 208.583 84.229"
+xml:space="preserve">
+<path fill="#070001" d="M166.844,51.551c0,0,22.529,5.493,27.457,4.416c4.928-1.076,12.331,1.57,12.592,6.651
+c0.547,10.658-21.068,4.085-40.164-4.369S166.844,51.551,166.844,51.551z"/>
+<path fill="#FBFAE8" stroke="#060001" stroke-miterlimit="10" d="M139.649,21.6c-0.25-1.75-14.215-9.95-27.25-10.25
+c-21.75-0.5-27.114,20.586-29.25,34c-0.668,4.197-5.213,10.727-7.687,11.82c-3.067,1.357-3.91,3.32-2.447,4.14
+c2.438,1.366,37.634,7.29,77.384-3.21C138.899,39.35,139.649,21.6,139.649,21.6z"/>
+<path fill="#070001" d="M106.943,12.473c-0.174-2.576-4.785-7.249-7.797-3.365c-3.014,3.884-2.32,5.065-0.377,6.572
+C100.711,17.186,107.285,17.564,106.943,12.473z"/>
+<path fill="#FBFAE8" stroke="#080103" stroke-width="2" stroke-miterlimit="10" d="M190.399,55.85c6.013-14.03,0-50-25-52
+s-34,13-36,26s14,34,28,37S187.399,62.85,190.399,55.85z"/>
+<path fill="#070001" d="M86.064,63.631c-7.491-0.821-13.198-1.123-14.084-1.619c-1.463-0.82,0.227-3.556,3.294-4.912
+c2.244-0.992,4.75-4.125,6.5-8C81.537,52.529,82.887,60.837,86.064,63.631z"/>
+<path fill="#070001" d="M186.399,21.264c4.852-3.508,1.593-14.713-5-14.164c-9.25,0.771-9,6.029-6,10.368
+C178.898,22.526,183.399,23.433,186.399,21.264z"/>
+<path fill="#070001" d="M196.899,37.6c1.265-2.53-0.22-8.556-5.5-7.5c-2.5,0.5-0.75,3.058-0.75,5.75
+C190.649,39.1,194.399,42.6,196.899,37.6z"/>
+<path fill="#070001" d="M186.716,52.486c1.392-2.179-1.573-7.141-5.144-6.354s-1.18,5.688,1.06,6.505
+C184.87,53.454,186.716,52.486,186.716,52.486z"/>
+<path fill="#070001" d="M176.217,36.27c1.641-3.109-1.977-12.587-8.17-10.74c-7.2,2.148-0.514,10.408,1.726,11.225
+C172.011,37.571,174.765,39.023,176.217,36.27z"/>
+<path fill="#DB698C" d="M159.466,42.644c0.479-0.87,7.156,2.642,13.117,3.996c0.071,4.148,2.923,13.869,2.223,14.086
+c-1.019,0.474-3.349-4.081-6.556-7.665C165.042,49.479,158.986,43.514,159.466,42.644z"/>
+<path fill="#070001" d="M176.371,47.823c1.012-1.585-1.146-5.194-3.741-4.622c-2.597,0.573-0.857,4.136,0.771,4.731
+C175.029,48.527,176.371,47.823,176.371,47.823z"/>
+<path fill="#070001" d="M131.899,21.85c-0.5,13,5.417,32.583,15.25,40.75c7.289,6.054,16,7.75,14,13.75s-13.834,7.584-21.833-0.75
+C132.071,68.053,122.149,34.35,131.899,21.85z"/>
+<g>
+<path fill="#EDE6CD" d="M115.6,83.201c-3.067,0-12.861-1.058-22.333-2.08c-8.52-0.92-17.329-1.871-19.742-1.871
+c-1.398,0-4.595,0.29-8.643,0.656c-6.615,0.6-15.674,1.42-22.3,1.42c-3.993,0-6.667-0.293-8.178-0.897
+C21.229,75.158,2.573,52.935,0.034,43.98c-2.013-7.1,1.647-10.246,5.521-13.576c2.168-1.863,4.625-3.975,6.489-7.048
+c1.47-2.424,2.82-4.867,4.125-7.23c3.748-6.783,6.708-12.141,10.163-12.77c2.458-0.447,10.359-0.664,24.156-0.664
+c22.211,0,52.131,0.595,54.616,0.645c0.991-0.292,2.493-0.441,4.469-0.441c6.996,0,20.62,2.044,26.955,5.739
+c5.885,3.433,11.121,18.022,11.121,27.114c0,7.64,2.998,15.207,9.164,23.136c1.505,1.935,2.045,4.46,1.521,7.111
+c-0.764,3.86-3.703,7.504-8.063,9.996C141.409,81.057,121.878,83.201,115.6,83.201z"/>
+<path fill="#50482E" d="M50.489,3.693c23.329,0,54.758,0.648,54.758,0.648c0.901-0.3,2.419-0.444,4.326-0.444
+c7.353,0,20.496,2.129,26.451,5.603c5.418,3.161,10.625,17.233,10.625,26.25c0,10,5,18.125,9.375,23.75
+c3.165,4.069,1.45,11.225-6.25,15.625c-8.75,5-28.343,7.076-34.175,7.076S78.628,78.25,73.524,78.25
+c-3.837,0-20.393,2.076-30.943,2.076c-3.48,0-6.305-0.226-7.807-0.826c-12.5-5-31.25-26.875-33.778-35.792
+C-1.9,33.491,7.57,32.661,12.899,23.875c5.525-9.11,9.53-18.792,13.612-19.534C29.185,3.855,38.825,3.693,50.489,3.693
+M50.492,1.693v2l-0.001-2c-13.859,0-21.819,0.223-24.336,0.68c-3.922,0.713-6.832,5.98-10.859,13.27
+c-1.3,2.354-2.645,4.788-4.105,7.195c-1.781,2.937-4.071,4.905-6.285,6.808c-3.943,3.389-8.02,6.893-5.833,14.608
+c2.586,9.119,21.563,31.744,34.96,37.104c1.653,0.661,4.37,0.969,8.549,0.969c6.672,0,15.757-0.823,22.391-1.424
+c3.863-0.35,7.2-0.652,8.553-0.652c2.359,0,11.142,0.948,19.635,1.865c9.498,1.025,19.319,2.086,22.44,2.086
+c6.354,0,26.144-2.184,35.167-7.34c4.612-2.636,7.729-6.524,8.548-10.67c0.582-2.939-0.026-5.752-1.711-7.919
+c-6.025-7.747-8.954-15.113-8.954-22.522c0-9.543-5.36-24.328-11.617-27.978c-6.572-3.833-20.17-5.875-27.459-5.875
+c-2.002,0-3.512,0.144-4.598,0.438C101.629,2.268,72.335,1.693,50.492,1.693L50.492,1.693z"/>
+</g>
+<path fill="#DFD2B4" d="M0.753,43.987c-2.94-10.371,6.674-11.214,12.083-20.133c5.609-9.248,9.675-19.077,13.818-19.83
+c0.649-0.118,1.706-0.217,3.096-0.299l21.07,19.707c-8.461,20.641-10.847,34.102-11.506,46.02
+c-0.309,5.582,2.985,8.004,5.626,10.744c-4.057,0.052-7.831,0.294-9.518-0.381C22.733,74.738,3.319,53.04,0.753,43.987z"/>
+<path fill="#DFD2B4" d="M157.337,62.125c0.738,4.104-1.227,9.636-7.354,13.138c-8.968,5.125-28.66,6.873-34.637,6.929
+c-3.411,0.032-8.264-0.592-16.27-1.473c23.169-1.239,37.861-4.521,49.712-9.23C153.932,69.442,155.812,65.644,157.337,62.125z"/>
+<path fill="#DB2218" d="M146.645,47.98c-7.629,0.012-10.737-7.677-10.62-13.605c0.087-4.386,4.525-8.732,8.566-10.085
+c1.273,4.085,2.059,8.34,2.059,11.835c0,4.25,1.001,8.038,2.375,11.563C148.294,47.79,147.447,47.979,146.645,47.98z"/>
+<path fill="#DB2218" d="M124.889,12.367c-2.125-0.755-6.124,2.155-6.183,5.093c-0.058,2.938,1.482,6.749,5.264,6.743
+c4.604-0.007,6.827-3.49,5.821-7.134S126.056,12.782,124.889,12.367z"/>
+<path fill="#DB2218" d="M118.161,53.44c12.887,0,20.824,16.796,16.107,26.648c-7.217,1.526-15.329,2.113-18.64,2.113
+c-1.528,0-5.839-0.214-10.419-0.854c-1.315-1.664-2.521-3.419-3.495-4.882C96.781,69.064,103.358,53.44,118.161,53.44z"/>
+<path fill="#DB2218" d="M67.463,74.771c11.481-6.263,8.771-19.138,1.595-24.719c-7.176-5.582-19.089,5.059-19.934,14.353
+C48.326,73.177,58.692,79.556,67.463,74.771z"/>
+<path fill="#DB2218" d="M102.274,38.875c0-6.374-5.625-12.5-13.125-6.25s0.625,12.5,3.125,13.125S102.274,43.25,102.274,38.875z"/>
+<path fill="#DB2218" d="M54.149,26.375c1.207,5.634-3.125,9.375-7.5,10.625s-19.375-3.125-10.625-16.25
+C41.929,11.894,52.274,17.625,54.149,26.375z"/>
+<path fill="#C61F1E" d="M7.613,29.643c1.416,0.671,2.786,1.708,4.045,3.196c7.782,9.197,3.864,17.439-4.244,21.223
+c-0.294,0.137-0.583,0.257-0.87,0.368c-3-4.142-5.167-7.92-5.918-10.569C-1.502,36.355,2.944,33.839,7.613,29.643z"/>
+<path fill="#C61F1E" d="M112.649,82.156c-1.752-0.128-3.811-0.349-3.82-0.349c-1.07-0.101-1.986-0.194-3.523-0.37
+c-0.197-0.248-0.645-0.791-0.832-1.039c12.78-0.894,22.723-2.367,31.018-4.58c-0.195,1.512-0.633,3.033-1.266,4.355
+c-0.17,0.107-5.879,1.047-11.999,1.582C119.203,82.02,116.356,82.428,112.649,82.156z"/>
+<path fill="#C61F1E" d="M36.024,20.75c2.061-3.09,4.662-4.4,7.255-4.43l6.37,7.471c-0.894,5.897-2.957,8.532-4.046,13.383
+C40.32,37.604,27.876,32.973,36.024,20.75z"/>
+<path fill="#815C25" d="M100.165,0.55c4.374,0,29.452,20.08,29.452,24.204c0,2.916-75.819,2.187-80.923,0
+C44.43,22.926,19.241,3.757,23.617,1.57C28.181-0.712,100.165,0.55,100.165,0.55z"/>
+</svg>
data/examples/data/bow.data.tf.filtered
ADDED
Binary file

data/examples/data/bow.data.tf.raw
ADDED
Binary file

data/examples/data/bow.data.tfidf.filtered
ADDED
Binary file

data/examples/data/bow.data.tfidf.raw
ADDED
Binary file
data/examples/data/bow.metadata.tf.filtered
ADDED
@@ -0,0 +1 @@
+{"n_documents":13181,"n_features":15914}

data/examples/data/bow.metadata.tf.raw
ADDED
@@ -0,0 +1 @@
+{"n_documents":14066,"n_features":15916}

data/examples/data/bow.metadata.tfidf.filtered
ADDED
@@ -0,0 +1 @@
+{"n_documents":13181,"n_features":15914}

data/examples/data/bow.metadata.tfidf.raw
ADDED
@@ -0,0 +1 @@
+{"n_documents":14066,"n_features":15916}
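Each of the four metadata files above is a single JSON object recording how many documents and distinct terms the matching bow.data.* stream contains; estimate-topics.py reads it to size its LDA model before touching the Arrow stream. A minimal sketch of producing such a file from Ruby (the real values are computed by raw-write-bow.rb further down; the literal numbers here are just copied from the tf.filtered entry above):

require "json"

# Write a one-line metadata file in the same shape as bow.metadata.tf.filtered.
metadata = {
  "n_documents" => 13181,
  "n_features" => 15914,
}
File.open("bow.metadata.tf.filtered", "w") do |file|
  file.puts(metadata.to_json)
end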
data/examples/data/topics.tf.filtered
ADDED
Binary file

data/examples/data/topics.tf.raw
ADDED
Binary file

data/examples/data/topics.tfidf.filtered
ADDED
Binary file

data/examples/data/topics.tfidf.raw
ADDED
Binary file
data/examples/estimate-topics.py
ADDED
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+
+import sys
+sys.path.append("/home/kou/work/cpp/arrow/python")
+
+import json
+
+import scipy as sp
+import pandas as pd
+import pyarrow as A
+from sklearn.decomposition import LatentDirichletAllocation
+
+LDA = LatentDirichletAllocation
+
+metadata_path = sys.argv[1]
+data_path = sys.argv[2]
+topics_path = sys.argv[3]
+if len(sys.argv) >= 5:
+    n_documents = int(sys.argv[4])
+else:
+    n_documents = None
+if len(sys.argv) >= 6:
+    n_topics = int(sys.argv[5])
+else:
+    n_topics = None
+
+if n_topics is None or n_topics == -1:
+    n_topics = 100
+
+with open(metadata_path) as metadata_file:
+    metadata = json.load(metadata_file)
+if n_documents is None or n_documents == -1:
+    n_documents = metadata["n_documents"]
+else:
+    n_documents = min(n_documents, metadata["n_documents"])
+n_features = metadata["n_features"]
+
+lda = LDA(n_topics=n_topics,
+          learning_method="online",
+          total_samples=n_documents,
+          n_jobs=1)
+
+with A.io.MemoryMappedFile(data_path, "rb") as source:
+    reader = A.ipc.StreamReader(source)
+    for i, batch in enumerate(reader):
+        if i >= n_documents:
+            break
+        sys.stdout.write("\r%.3f%%" % ((i / n_documents) * 100))
+        df = batch.to_pandas()
+        corpus = sp.sparse.csr_matrix((df["score"].values,
+                                       df["term_id"].values,
+                                       [0, df["term_id"].size]),
+                                      shape=(1, n_features))
+        lda.partial_fit(corpus)
+sys.stdout.write("\n")
+
+def topic_to_df(topic):
+    n_top_terms = 10
+    return pd.DataFrame([[i, topic[i]]
+                         for i in topic.argsort()[:-n_top_terms - 1:-1]],
+                        columns=["term_id", "score"])
+
+topic = lda.components_[0]
+topic_df = topic_to_df(topic)
+schema = A.RecordBatch.from_pandas(topic_df).schema
+with open(topics_path, "wb") as sink:
+    writer = A.ipc.StreamWriter(sink, schema)
+    for topic in lda.components_:
+        topic_df = topic_to_df(topic)
+        topic_record_batch = A.RecordBatch.from_pandas(topic_df)
+        writer.write_batch(topic_record_batch)
+    writer.close()
data/examples/raw-show-related-terms.rb
ADDED
@@ -0,0 +1,38 @@
+#!/usr/bin/env ruby
+
+require "groonga"
+require "gi"
+
+db_path = ARGV[0]
+output_path = ARGV[1]
+
+Arrow = GI.load("Arrow")
+ArrowIO = GI.load("ArrowIO")
+ArrowIPC = GI.load("ArrowIPC")
+
+Groonga::Database.open(db_path)
+terms = Groonga["Words"]
+
+input_stream = ArrowIO::MemoryMappedFile.open(output_path, :read)
+begin
+  reader = ArrowIPC::StreamReader.open(input_stream)
+  loop do
+    record_batch = reader.next_record_batch
+    break if record_batch.nil?
+    columns = record_batch.columns
+    related_terms = []
+    previous_score = nil
+    record_batch.n_rows.times do |i|
+      score = columns[1].get_value(i)
+      break if score < 0.1
+      previous_score ||= score
+      break if (previous_score - score) > (score / 2.0)
+      term = Groonga::Record.new(terms, columns[0].get_value(i)).key
+      related_terms << [term, score]
+    end
+    next if related_terms.size < 2
+    p related_terms
+  end
ensure
+  input_stream.close
+end
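The inner loop above is the term-selection heuristic for a topic: it stops at the first score below 0.1, or when the gap from the first (highest) score grows larger than half the current score, and topics that keep fewer than two terms are skipped. A standalone sketch of the same cutoff on plain [term, score] pairs (the sample terms and scores are made up):

# Same cutoff as the loop above, applied to plain [term, score] pairs.
pairs = [["ruby", 0.9], ["arrow", 0.8], ["gobject", 0.3], ["noise", 0.05]]
related_terms = []
previous_score = nil
pairs.each do |term, score|
  break if score < 0.1
  previous_score ||= score
  break if (previous_score - score) > (score / 2.0)
  related_terms << [term, score]
end
p related_terms # => [["ruby", 0.9], ["arrow", 0.8]]; the drop to 0.3 ends the topic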
data/examples/raw-write-bow.rb
ADDED
@@ -0,0 +1,124 @@
+#!/usr/bin/env ruby
+
+require "groonga"
+require "gi"
+
+db_path = ARGV[0]
+metadata_output_path = ARGV[1]
+data_output_path = ARGV[2]
+use_tfidf = (ARGV[3] != "tf")
+use_filter = (ARGV[4] != "raw")
+
+Arrow = GI.load("Arrow")
+ArrowIO = GI.load("ArrowIO")
+ArrowIPC = GI.load("ArrowIPC")
+
+Groonga::Database.open(db_path)
+
+Groonga::Schema.define do |schema|
+  schema.create_table("Words",
+                      :type => :patricia_trie,
+                      :key_type => "ShortText",
+                      :default_tokenizer => "TokenMecab",
+                      :normalizer => "NormalizerAuto") do |table|
+    table.index("Entries.document")
+  end
+end
+
+n_entries = Groonga["Entries"].size
+too_many_much_threshold = n_entries * 0.25
+too_less_much_threshold = n_entries * 0.001
+
+bow = {}
+index = Groonga["Words.Entries_document"]
+max_term_id = 0
+index.table.open_cursor(:order_by => :id) do |table_cursor|
+  table_cursor.each do |term|
+    n_match_documents = index.estimate_size(term)
+    # p [term.key, n_match_documents, (n_match_documents / n_entries.to_f)]
+    if use_filter
+      if n_match_documents <= too_less_much_threshold
+        p [:skip, :too_less, term.key, n_match_documents]
+        next
+      end
+      if n_match_documents >= too_many_much_threshold
+        p [:skip, :too_many, term.key, n_match_documents]
+        next
+      end
+    end
+    max_term_id = [max_term_id, term.id].max
+    df = Math.log(n_entries.to_f / n_match_documents)
+    index.open_cursor(term.id,
+                      :with_position => false) do |index_cursor|
+      index_cursor.each(:reuse_posting_object => true) do |posting|
+        next unless posting.record.version.key == "2.4.0"
+        bow[posting.record_id] ||= []
+        if use_tfidf
+          score = posting.term_frequency / df
+        else
+          score = posting.term_frequency
+        end
+        bow[posting.record_id] << [posting.term_id, score]
+      end
+    end
+  end
+end
+
+File.open(metadata_output_path, "w") do |metadata_file|
+  metadata_file.puts({
+    "n_documents" => bow.size,
+    "n_features" => max_term_id,
+  }.to_json)
+end
+
+module Arrow
+  class ArrayBuilder
+    class << self
+      def build(values)
+        builder = new
+        values.each do |value|
+          builder.append(value)
+        end
+        builder.finish
+      end
+    end
+  end
+
+  class UInt32Array
+    class << self
+      def new(values)
+        UInt32ArrayBuilder.build(values)
+      end
+    end
+  end
+
+  class DoubleArray
+    class << self
+      def new(values)
+        DoubleArrayBuilder.build(values)
+      end
+    end
+  end
+end
+
+output_stream = ArrowIO::FileOutputStream.open(data_output_path, false)
+begin
+  term_id_field = Arrow::Field.new("term_id", Arrow::UInt32DataType.new)
+  score_field = Arrow::Field.new("score", Arrow::DoubleDataType.new)
+  schema = Arrow::Schema.new([term_id_field, score_field])
+  writer = ArrowIPC::StreamWriter.open(output_stream, schema)
+  begin
+    bow.each do |record_id, words|
+      term_ids = Arrow::UInt32Array.new(words.collect(&:first))
+      scores = Arrow::DoubleArray.new(words.collect(&:last))
+      record_batch = Arrow::RecordBatch.new(schema,
+                                            words.size,
+                                            [term_ids, scores])
+      writer.write_record_batch(record_batch)
+    end
+  ensure
+    writer.close
+  end
+ensure
+  output_stream.close
+end
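raw-write-bow.rb above shows the whole raw, GObject Introspection based write path: build each column with an array builder, describe the columns with Arrow::Field and Arrow::Schema, and push Arrow::RecordBatch objects through an ArrowIPC::StreamWriter. The following is a minimal round-trip sketch that uses only the calls appearing in these examples; it is not part of the gem, it assumes the same Arrow/ArrowIO/ArrowIPC typelib setup that run.sh exports, and /tmp/example.arrow is a made-up path:

#!/usr/bin/env ruby

require "gi"

Arrow    = GI.load("Arrow")
ArrowIO  = GI.load("ArrowIO")
ArrowIPC = GI.load("ArrowIPC")

# Build the two columns with the builder API used in raw-write-bow.rb.
term_id_builder = Arrow::UInt32ArrayBuilder.new
[1, 2, 3].each {|term_id| term_id_builder.append(term_id)}
term_ids = term_id_builder.finish

score_builder = Arrow::DoubleArrayBuilder.new
[0.5, 0.25, 0.125].each {|score| score_builder.append(score)}
scores = score_builder.finish

schema = Arrow::Schema.new([
  Arrow::Field.new("term_id", Arrow::UInt32DataType.new),
  Arrow::Field.new("score", Arrow::DoubleDataType.new),
])

# Write one record batch to a stream...
output = ArrowIO::FileOutputStream.open("/tmp/example.arrow", false)
begin
  writer = ArrowIPC::StreamWriter.open(output, schema)
  begin
    record_batch = Arrow::RecordBatch.new(schema, 3, [term_ids, scores])
    writer.write_record_batch(record_batch)
  ensure
    writer.close
  end
ensure
  output.close
end

# ...and read it back, as raw-show-related-terms.rb does.
input = ArrowIO::MemoryMappedFile.open("/tmp/example.arrow", :read)
begin
  reader = ArrowIPC::StreamReader.open(input)
  loop do
    record_batch = reader.next_record_batch
    break if record_batch.nil?
    record_batch.n_rows.times do |i|
      p record_batch.columns.collect {|column| column.get_value(i)}
    end
  end
ensure
  input.close
end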
data/examples/read.rb
ADDED
@@ -0,0 +1,30 @@
+#!/usr/bin/env ruby
+
+require "gi"
+
+Arrow = GI.load("Arrow")
+ArrowIO = GI.load("ArrowIO")
+ArrowIPC = GI.load("ArrowIPC")
+
+module Arrow
+  class Array
+    def [](i)
+      get_value(i)
+    end
+
+    include Enumerable
+    def each
+      length.times do |i|
+        yield(self[i])
+      end
+    end
+  end
+end
+
+file = ArrowIO::MemoryMappedFile.open("/tmp/xxx", :read)
+reader = ArrowIPC::FileReader.open(file)
+p reader.schema.fields.collect(&:name)
+record_batch = reader.get_record_batch(0)
+record_batch.n_rows.times do |i|
+  p record_batch.columns.collect {|column| column[i]}
+end
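Because the patch above gives Arrow::Array both [] and Enumerable, a column read this way behaves like an ordinary Ruby collection. A short follow-on sketch that could be appended to read.rb; the column index assumes the two-column term_id/score layout used by the other examples, which is an assumption about whatever /tmp/xxx contains:

# With [] and Enumerable patched in above, a column supports the usual
# Enumerable API.
score_column = record_batch.columns[1] # assumes a term_id/score schema
p score_column.to_a                    # all values as a plain Ruby array
p score_column.inject(:+)              # sum of the column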
data/examples/run.sh
ADDED
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+export LD_LIBRARY_PATH=/tmp/local/lib:$LD_LIBRARY_PATH
+export GI_TYPELIB_PATH=/tmp/local/lib/girepository-1.0:$GI_TYPELIB_PATH
+
+base_dir=$(dirname $0)
+data_dir=$base_dir/data
+
+mkdir -p $data_dir
+
+for score in tf tfidf; do
+  for filter in raw filtered; do
+    (
+      ruby \
+        -I ~/work/ruby/rarrow/lib \
+        -I ~/work/ruby/rroonga/lib \
+        -I ~/work/ruby/rroonga/ext/groonga \
+        $base_dir/write-bow.rb \
+        ~/work/ruby/rurema-search/groonga-database/bitclust.db \
+        $data_dir/bow.metadata.$score.$filter \
+        $data_dir/bow.data.$score.$filter \
+        $score \
+        $filter &&
+      python \
+        $base_dir/estimate-topics.py \
+        $data_dir/bow.metadata.$score.$filter \
+        $data_dir/bow.data.$score.$filter \
+        $data_dir/topics.$score.$filter
+    ) &
+  done
+done
+
+wait
data/examples/show-related-terms.rb
ADDED
@@ -0,0 +1,39 @@
+#!/usr/bin/env ruby
+
+require "groonga"
+require "arrow"
+
+db_path = ARGV[0]
+topics_path = ARGV[1]
+
+Groonga::Database.open(db_path)
+terms = Groonga["Words"]
+index = Groonga["Words.Entries_document"]
+
+Arrow::IO::MemoryMappedFile.open(topics_path, :read) do |input_stream|
+  Arrow::IPC::StreamReader.open(input_stream) do |reader|
+    reader.each do |record_batch|
+      related_terms = []
+      previous_score = nil
+      # p :topic_raw
+      # record_batch.each do |record|
+      #   term = Groonga::Record.new(terms, record["term_id"]).key
+      #   p [record["term_id"], record["score"], term, index.estimate_size(term)]
+      # end
+      record_batch.each do |record|
+        score = record["score"]
+        break if score < 0.1
+        previous_score ||= score
+        break if (previous_score - score) > (previous_score / 2.0)
+        previous_score = score
+        term = Groonga::Record.new(terms, record["term_id"]).key
+        related_terms << [term, score]
+      end
+      next if related_terms.size < 2
+      p :topic
+      related_terms.each do |term|
+        p term
+      end
+    end
+  end
+end
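show-related-terms.rb is the same related-terms report as raw-show-related-terms.rb, but written against the higher-level arrow gem API loaded from ~/work/ruby/rarrow/lib in run.sh: the input stream and reader take blocks, record batches are enumerable, and records support ["column"] access. Stripped of the Groonga lookups, the core read loop is this minimal sketch (the relative data/topics.tf.filtered path is an assumption; it is one of the files run.sh writes):

require "arrow"

# Minimal read loop with the higher-level API used in show-related-terms.rb.
Arrow::IO::MemoryMappedFile.open("data/topics.tf.filtered", :read) do |input|
  Arrow::IPC::StreamReader.open(input) do |reader|
    reader.each do |record_batch|
      record_batch.each do |record|
        p [record["term_id"], record["score"]]
      end
    end
  end
end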